[llvm] [X86] Check test changes due to allowing schedulers (PR #189587)

Gergo Stomfai via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 31 03:28:47 PDT 2026


https://github.com/stomfaig created https://github.com/llvm/llvm-project/pull/189587

It seems that X86 always defaults to the `SourceListDAGScheduler` because the `enableMachineSchedDefaultSched` hook is not overridden.

Below are the test changes that result from overriding this hook so that the scheduler is chosen according to the logic in `X86TargetLowering::X86TargetLowering`.
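
For context, the dispatch that causes this lives in `createDefaultScheduler` in `llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp`. The sketch below is condensed and paraphrased (exact signatures vary by revision), not the verbatim upstream code: because `X86Subtarget::enableMachineScheduler()` returns true and the base-class `enableMachineSchedDefaultSched()` defaults to true, the scheduling preference set in `X86TargetLowering::X86TargetLowering` is never consulted.

```cpp
// Condensed sketch of createDefaultScheduler (SelectionDAGISel.cpp).
ScheduleDAGSDNodes *createDefaultScheduler(SelectionDAGISel *IS,
                                           CodeGenOptLevel OptLevel) {
  const TargetLowering *TLI = IS->TLI;
  const TargetSubtargetInfo &ST = IS->MF->getSubtarget();

  // With the MachineScheduler enabled and enableMachineSchedDefaultSched()
  // left at its default of true, this branch always fires for X86, so the
  // TLI scheduling preference checked below is effectively dead code.
  if (OptLevel == CodeGenOptLevel::None ||
      (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) ||
      TLI->getSchedulingPreference() == Sched::Source)
    return createSourceListDAGScheduler(IS, OptLevel);
  if (TLI->getSchedulingPreference() == Sched::RegPressure)
    return createBURRListDAGScheduler(IS, OptLevel);
  // ... Sched::Hybrid / Sched::ILP / Sched::VLIW cases follow ...
}

// Meanwhile, X86TargetLowering::X86TargetLowering sets (approximately):
//   if (!Subtarget.is64Bit())
//     setSchedulingPreference(Sched::RegPressure);
//   else
//     setSchedulingPreference(Sched::ILP);
```

With the override returning `is64Bit()`, 64-bit targets keep the source-order scheduler (the MachineScheduler reschedules later anyway), while 32-bit targets now fall through to their register-pressure preference, which is what produces the diffs below.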

In a few cases where the 32-bit and 64-bit code currently agrees, one should also add new check prefixes, but there are already 600+ changed files, so I have not done this yet.
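
For illustration only (hypothetical RUN lines, not copied from a particular test in this patch), splitting a shared prefix would look like this:

```llvm
; Before: both RUN lines share the plain CHECK prefix, which only works
; while the 32-bit and 64-bit output is identical.
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s

; After: per-arch prefixes let the two outputs diverge while keeping any
; still-common lines under the shared CHECK prefix.
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=CHECK,X64
```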

From ba29353db52bac10c023232c96ccacd4daac8918 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Tue, 31 Mar 2026 11:19:28 +0100
Subject: [PATCH] check test changes due to allowing sched'ers

---
 llvm/lib/Target/X86/X86Subtarget.h            |     2 +
 .../CodeGen/X86/2006-03-01-InstrSchedBug.ll   |     8 +-
 .../X86/2006-05-08-CoalesceSubRegClass.ll     |     9 +-
 .../test/CodeGen/X86/2006-05-08-InstrSched.ll |    14 +-
 .../CodeGen/X86/2006-08-21-ExtraMovInst.ll    |     2 +-
 .../test/CodeGen/X86/2007-02-16-BranchFold.ll |    64 +-
 .../CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll    |     4 +-
 .../X86/2007-10-12-CoalesceExtSubReg.ll       |     8 +-
 .../CodeGen/X86/2007-10-12-SpillerUnfold1.ll  |    45 +-
 .../CodeGen/X86/2007-11-30-LoadFolding-Bug.ll |     4 +-
 .../X86/2008-02-22-LocalRegAllocBug.ll        |     6 +-
 llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll  |    10 +-
 .../X86/2008-06-13-VolatileLoadStore.ll       |     8 +-
 .../CodeGen/X86/2008-12-02-dagcombine-3.ll    |     2 +-
 .../CodeGen/X86/2008-12-23-crazy-address.ll   |     6 +-
 llvm/test/CodeGen/X86/2009-01-31-BigShift.ll  |    31 +-
 llvm/test/CodeGen/X86/2009-01-31-BigShift2.ll |    10 +-
 .../X86/2009-04-29-IndirectDestOperands.ll    |     4 +-
 .../X86/2009-06-05-VariableIndexInsert.ll     |     4 +-
 llvm/test/CodeGen/X86/2011-05-09-loaduse.ll   |     2 +-
 .../CodeGen/X86/2011-10-19-widen_vselect.ll   |     4 +-
 .../X86/2012-01-10-UndefExceptionEdge.ll      |     2 +-
 .../test/CodeGen/X86/2012-01-12-extract-sv.ll |    24 +-
 llvm/test/CodeGen/X86/2012-07-10-extload64.ll |     2 +-
 llvm/test/CodeGen/X86/3addr-16bit.ll          |    20 +-
 .../CodeGen/X86/64-bit-shift-by-32-minus-y.ll |   112 +-
 .../X86/8bit_cmov_of_trunc_promotion.ll       |    80 +-
 .../X86/DynamicCalleeSavedRegisters.ll        |     8 +-
 .../CodeGen/X86/MergeConsecutiveStores.ll     |    68 +-
 llvm/test/CodeGen/X86/SwitchLowering.ll       |    25 +-
 llvm/test/CodeGen/X86/WidenArith.ll           |     4 +-
 llvm/test/CodeGen/X86/abds-neg.ll             |   376 +-
 llvm/test/CodeGen/X86/abds.ll                 |   215 +-
 llvm/test/CodeGen/X86/abdu-neg.ll             |   277 +-
 llvm/test/CodeGen/X86/abdu.ll                 |   205 +-
 llvm/test/CodeGen/X86/abs.ll                  |   463 +-
 llvm/test/CodeGen/X86/add-and-not.ll          |    27 +-
 llvm/test/CodeGen/X86/add-sub-bool.ll         |    62 +-
 llvm/test/CodeGen/X86/add.ll                  |    34 +-
 llvm/test/CodeGen/X86/addr-mode-matcher-3.ll  |    14 +-
 llvm/test/CodeGen/X86/addr-mode-matcher-4.ll  |     6 +-
 .../CodeGen/X86/addsub-constant-folding.ll    |    55 +-
 llvm/test/CodeGen/X86/align-down-const.ll     |    10 +-
 llvm/test/CodeGen/X86/align-down.ll           |    36 +-
 llvm/test/CodeGen/X86/and-mask-variable.ll    |    82 +-
 llvm/test/CodeGen/X86/and-or-fold.ll          |     6 +-
 llvm/test/CodeGen/X86/and-or-setcc.ll         |   100 +-
 llvm/test/CodeGen/X86/and-sink.ll             |    12 +-
 llvm/test/CodeGen/X86/and-su.ll               |     4 +-
 llvm/test/CodeGen/X86/andimm8.ll              |    14 +-
 llvm/test/CodeGen/X86/andnot-blsmsk.ll        |     4 +-
 llvm/test/CodeGen/X86/andnot-patterns.ll      |    34 +-
 llvm/test/CodeGen/X86/andnot-sink-not.ll      |  1930 +-
 llvm/test/CodeGen/X86/apx/i386-ndd.ll         |     2 +-
 llvm/test/CodeGen/X86/arg-copy-elide.ll       |    24 +-
 llvm/test/CodeGen/X86/arithmetic_fence.ll     |    24 +-
 llvm/test/CodeGen/X86/arithmetic_fence2.ll    |    55 +-
 llvm/test/CodeGen/X86/atom-fixup-lea3.ll      |    48 +
 llvm/test/CodeGen/X86/atomic-bit-test.ll      |    15 +-
 llvm/test/CodeGen/X86/atomic-fp.ll            |    16 +-
 .../X86/atomic-idempotent-syncscope.ll        |    49 +-
 llvm/test/CodeGen/X86/atomic-idempotent.ll    |    90 +-
 .../CodeGen/X86/atomic-load-store-wide.ll     |    10 +-
 llvm/test/CodeGen/X86/atomic-mi.ll            |    28 +-
 llvm/test/CodeGen/X86/atomic-minmax-i6432.ll  |     8 +-
 llvm/test/CodeGen/X86/atomic-non-integer.ll   |    91 +-
 llvm/test/CodeGen/X86/atomic-op.ll            |   255 +-
 llvm/test/CodeGen/X86/atomic-rm-bit-test.ll   |  1023 +-
 .../CodeGen/X86/atomicrmw-cond-sub-clamp.ll   |    88 +-
 llvm/test/CodeGen/X86/avx-cvt-3.ll            |    16 +-
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |    26 +-
 llvm/test/CodeGen/X86/avx-intrinsics-x86.ll   |     8 +-
 llvm/test/CodeGen/X86/avx-select.ll           |     4 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll       |    68 +-
 llvm/test/CodeGen/X86/avx-vbroadcastf128.ll   |    18 +-
 llvm/test/CodeGen/X86/avx.ll                  |    14 +-
 .../CodeGen/X86/avx1-logical-load-folding.ll  |    14 +-
 llvm/test/CodeGen/X86/avx10_2-cmp.ll          |    15 +-
 .../test/CodeGen/X86/avx10_2_512bf16-arith.ll |    20 +-
 .../CodeGen/X86/avx10_2_512bf16-intrinsics.ll |     4 +-
 .../CodeGen/X86/avx10_2_512ni-intrinsics.ll   |     9 +-
 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll    |    40 +-
 .../CodeGen/X86/avx10_2fptosi_satcvtds.ll     |    38 +-
 llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll |    18 +-
 .../CodeGen/X86/avx10_2satcvtds-intrinsics.ll |    76 +-
 .../test/CodeGen/X86/avx2-fma-fneg-combine.ll |    47 +-
 llvm/test/CodeGen/X86/avx2-gather.ll          |     4 +-
 .../X86/avx2-intrinsics-x86-upgrade.ll        |     2 +-
 llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll  |    76 +-
 llvm/test/CodeGen/X86/avx2-masked-gather.ll   |    32 +-
 llvm/test/CodeGen/X86/avx2-nontemporal.ll     |    40 +-
 llvm/test/CodeGen/X86/avx2-vbroadcast.ll      |    24 +-
 llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll  |    52 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  |   741 +-
 .../CodeGen/X86/avx512-gfni-intrinsics.ll     |   284 +-
 llvm/test/CodeGen/X86/avx512-intel-ocl.ll     |     8 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |   124 +-
 .../CodeGen/X86/avx512-intrinsics-upgrade.ll  |   898 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    |   559 +-
 llvm/test/CodeGen/X86/avx512-load-store.ll    |    48 +-
 llvm/test/CodeGen/X86/avx512-mask-op.ll       |   154 +-
 llvm/test/CodeGen/X86/avx512-regcall-Mask.ll  |    24 +-
 .../test/CodeGen/X86/avx512-regcall-NoMask.ll |   128 +-
 llvm/test/CodeGen/X86/avx512-select.ll        |   112 +-
 .../X86/avx512-shuffles/shuffle-blend.ll      |   174 +-
 llvm/test/CodeGen/X86/avx512bf16-mov.ll       |    16 +-
 .../X86/avx512bf16-vl-intrinsics-upgrade.ll   |     2 +-
 .../CodeGen/X86/avx512bf16-vl-intrinsics.ll   |     2 +-
 .../X86/avx512bw-intrinsics-fast-isel.ll      |    20 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        |   134 +-
 llvm/test/CodeGen/X86/avx512bw-intrinsics.ll  |   155 +-
 .../X86/avx512bwvl-intrinsics-upgrade.ll      |  1070 +-
 .../test/CodeGen/X86/avx512bwvl-intrinsics.ll |    59 +-
 llvm/test/CodeGen/X86/avx512dq-intrinsics.ll  |   174 +-
 .../X86/avx512dqvl-intrinsics-upgrade.ll      |    52 +-
 .../test/CodeGen/X86/avx512dqvl-intrinsics.ll |    48 +-
 llvm/test/CodeGen/X86/avx512fp16-cvt.ll       |    31 +-
 .../CodeGen/X86/avx512fp16-fma-intrinsics.ll  |    14 +-
 llvm/test/CodeGen/X86/avx512fp16-mov.ll       |   110 +-
 .../X86/avx512ifma-intrinsics-upgrade.ll      |    32 +-
 .../test/CodeGen/X86/avx512ifma-intrinsics.ll |    32 +-
 .../X86/avx512vbmi2-intrinsics-upgrade.ll     |    14 +-
 .../CodeGen/X86/avx512vbmi2-intrinsics.ll     |    14 +-
 .../X86/avx512vbmi2vl-intrinsics-upgrade.ll   |   194 +-
 .../CodeGen/X86/avx512vbmi2vl-intrinsics.ll   |    86 +-
 .../X86/avx512vl-intrinsics-upgrade.ll        |  1562 +-
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll  |   292 +-
 .../X86/avx512vl_vnni-intrinsics-upgrade.ll   |    32 +-
 .../CodeGen/X86/avx512vl_vnni-intrinsics.ll   |    32 +-
 .../X86/avx512vlvp2intersect-intrinsics.ll    |   238 +-
 .../X86/avx512vp2intersect-intrinsics.ll      |    96 +-
 .../CodeGen/X86/avxvnniint8-intrinsics.ll     |   192 +-
 .../X86/bfloat-calling-conv-no-sse2.ll        |   232 +-
 llvm/test/CodeGen/X86/bfloat.ll               |   114 +-
 llvm/test/CodeGen/X86/bitcast-mmx.ll          |    28 +-
 llvm/test/CodeGen/X86/bitreverse.ll           |  1930 +-
 llvm/test/CodeGen/X86/bitselect.ll            |    39 +-
 llvm/test/CodeGen/X86/bittest-big-integer.ll  |   897 +-
 llvm/test/CodeGen/X86/blend-of-shift.ll       |   152 +-
 llvm/test/CodeGen/X86/bmi-out-of-order.ll     |   125 +-
 llvm/test/CodeGen/X86/bmi-select-distrib.ll   |    70 +-
 llvm/test/CodeGen/X86/bmi.ll                  |    80 +-
 llvm/test/CodeGen/X86/bmi2.ll                 |    52 +-
 llvm/test/CodeGen/X86/bool-vector.ll          |    40 +-
 .../X86/broadcast-elm-cross-splat-vec.ll      |  1079 +-
 llvm/test/CodeGen/X86/bsf.ll                  |   121 +-
 llvm/test/CodeGen/X86/bsr.ll                  |   197 +-
 llvm/test/CodeGen/X86/bswap-wide-int.ll       |    80 +-
 llvm/test/CodeGen/X86/bswap.ll                |   167 +-
 llvm/test/CodeGen/X86/bswap_tree2.ll          |    32 +-
 llvm/test/CodeGen/X86/bt.ll                   |    76 +-
 llvm/test/CodeGen/X86/btc_bts_btr.ll          |   292 +-
 llvm/test/CodeGen/X86/build-vector-128.ll     |    52 +-
 llvm/test/CodeGen/X86/build-vector-256.ll     |    12 +-
 llvm/test/CodeGen/X86/build-vector-512.ll     |    26 +-
 .../CodeGen/X86/build_fp16_constant_vector.ll |    10 +-
 llvm/test/CodeGen/X86/byref.ll                |    15 +-
 llvm/test/CodeGen/X86/byval2.ll               |    16 +-
 llvm/test/CodeGen/X86/byval3.ll               |    12 +-
 llvm/test/CodeGen/X86/byval4.ll               |    12 +-
 llvm/test/CodeGen/X86/byval5.ll               |    20 +-
 llvm/test/CodeGen/X86/canonicalize-vars.ll    |   181 +-
 llvm/test/CodeGen/X86/clear-highbits.ll       |   232 +-
 llvm/test/CodeGen/X86/clear-lowbits.ll        |   299 +-
 llvm/test/CodeGen/X86/clmul-x86.ll            |    74 +-
 llvm/test/CodeGen/X86/cmov-fp.ll              |   288 +-
 llvm/test/CodeGen/X86/cmovcmov.ll             |    60 +-
 llvm/test/CodeGen/X86/cmp16.ll                |    14 +-
 llvm/test/CodeGen/X86/cmpf-avx.ll             |    36 +-
 llvm/test/CodeGen/X86/cmpxchg8b.ll            |     6 +-
 llvm/test/CodeGen/X86/coalescer-commute1.ll   |    14 +-
 llvm/test/CodeGen/X86/combine-adc.ll          |    30 +-
 llvm/test/CodeGen/X86/combine-bswap.ll        |    18 +-
 llvm/test/CodeGen/X86/combine-fneg.ll         |    16 +-
 llvm/test/CodeGen/X86/combine-multiplies.ll   |    52 +-
 llvm/test/CodeGen/X86/combine-sbb.ll          |    50 +-
 .../CodeGen/X86/conditional-tailcall-pgso.ll  |     2 +-
 llvm/test/CodeGen/X86/conditional-tailcall.ll |     4 +-
 .../CodeGen/X86/const-shift-of-constmasked.ll |    58 +-
 llvm/test/CodeGen/X86/copy-eflags.ll          |    37 +-
 llvm/test/CodeGen/X86/ctlo.ll                 |    46 +-
 llvm/test/CodeGen/X86/ctlz.ll                 |    88 +-
 llvm/test/CodeGen/X86/cttz.ll                 |    30 +-
 llvm/test/CodeGen/X86/cvtv2f32.ll             |    18 +-
 .../CodeGen/X86/dagcombine-buildvector.ll     |     4 +-
 llvm/test/CodeGen/X86/dagcombine-cse.ll       |    49 +-
 .../test/CodeGen/X86/dagcombine-dead-store.ll |     2 +-
 llvm/test/CodeGen/X86/dagcombine-shifts.ll    |    65 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |   702 +-
 .../div-rem-pair-recomposition-unsigned.ll    |   674 +-
 llvm/test/CodeGen/X86/div_i129_v_pow2k.ll     |    91 +-
 llvm/test/CodeGen/X86/divide-by-constant.ll   |    16 +-
 llvm/test/CodeGen/X86/divrem.ll               |   106 +-
 llvm/test/CodeGen/X86/divrem8_ext.ll          |     4 +-
 llvm/test/CodeGen/X86/dollar-name.ll          |     2 +-
 llvm/test/CodeGen/X86/emutls-pic.ll           |   200 +-
 .../CodeGen/X86/expand-vp-cast-intrinsics.ll  |    10 +-
 .../CodeGen/X86/expand-vp-fp-intrinsics.ll    |    28 +-
 .../CodeGen/X86/expand-vp-int-intrinsics.ll   |   100 +-
 llvm/test/CodeGen/X86/extract-bits.ll         |  4066 ++--
 llvm/test/CodeGen/X86/extract-lowbits.ll      |  1155 +-
 llvm/test/CodeGen/X86/extract-store.ll        |    72 +-
 llvm/test/CodeGen/X86/extractelement-fp.ll    |     4 +-
 ...ractelement-legalization-store-ordering.ll |    26 +-
 llvm/test/CodeGen/X86/extractelement-load.ll  |    32 +-
 llvm/test/CodeGen/X86/fildll.ll               |     2 +-
 llvm/test/CodeGen/X86/fixup-bw-copy.ll        |     4 +-
 llvm/test/CodeGen/X86/fixup-lea.ll            |    10 +
 llvm/test/CodeGen/X86/flt-rounds.ll           |    63 +-
 llvm/test/CodeGen/X86/fma.ll                  |     8 +-
 llvm/test/CodeGen/X86/fmf-flags.ll            |     9 +-
 llvm/test/CodeGen/X86/fminimum-fmaximum.ll    |   180 +-
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    |   182 +-
 llvm/test/CodeGen/X86/fmuladd-soft-float.ll   |   879 +-
 llvm/test/CodeGen/X86/fold-and-shift.ll       |    38 +-
 llvm/test/CodeGen/X86/fold-load.ll            |    10 +-
 llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll       |    34 +-
 llvm/test/CodeGen/X86/fold-tied-op.ll         |   118 +-
 llvm/test/CodeGen/X86/fp-arith.ll             |    48 +-
 llvm/test/CodeGen/X86/fp-intrinsics.ll        |    14 +-
 llvm/test/CodeGen/X86/fp-stack-2results.ll    |    26 +-
 llvm/test/CodeGen/X86/fp-stack-ret-conv.ll    |    12 +-
 .../CodeGen/X86/fp-strict-libcalls-msvc32.ll  |     3 -
 .../CodeGen/X86/fp-strict-scalar-cmp-fp16.ll  |   112 +-
 llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll |   722 +-
 .../test/CodeGen/X86/fp-strict-scalar-fp16.ll |     8 +-
 .../CodeGen/X86/fp-strict-scalar-fptoint.ll   |     4 +-
 .../CodeGen/X86/fp-strict-scalar-inttofp.ll   |     8 +-
 llvm/test/CodeGen/X86/fp-strict-scalar.ll     |    42 +-
 llvm/test/CodeGen/X86/fp128-cast-strict.ll    |   338 +-
 llvm/test/CodeGen/X86/fp128-cast.ll           |   500 +-
 .../test/CodeGen/X86/fp128-libcalls-strict.ll |  1400 +-
 llvm/test/CodeGen/X86/fp128-libcalls.ll       |  1656 +-
 llvm/test/CodeGen/X86/fp16-libcalls.ll        |   232 +-
 llvm/test/CodeGen/X86/fp80-strict-libcalls.ll |    17 +-
 .../CodeGen/X86/fp80-strict-scalar-cmp.ll     |    24 +-
 llvm/test/CodeGen/X86/fptosi-sat-scalar.ll    |  1220 +-
 llvm/test/CodeGen/X86/fptoui-sat-scalar.ll    |  1130 +-
 llvm/test/CodeGen/X86/freeze-binary.ll        |    34 +-
 llvm/test/CodeGen/X86/freeze-unary.ll         |    22 +-
 llvm/test/CodeGen/X86/freeze-vector.ll        |   174 +-
 llvm/test/CodeGen/X86/fshl-splat-undef.ll     |     4 +-
 llvm/test/CodeGen/X86/fshl.ll                 |   246 +-
 llvm/test/CodeGen/X86/fshr.ll                 |   243 +-
 llvm/test/CodeGen/X86/fsxor-alignment.ll      |    16 +-
 llvm/test/CodeGen/X86/ftrunc.ll               |     6 +-
 llvm/test/CodeGen/X86/funnel-shift-rot.ll     |     2 +-
 llvm/test/CodeGen/X86/funnel-shift.ll         |   178 +-
 llvm/test/CodeGen/X86/gather-addresses.ll     |    56 +-
 llvm/test/CodeGen/X86/ghc-cc.ll               |    43 +-
 llvm/test/CodeGen/X86/gpr-to-mask.ll          |    12 +-
 llvm/test/CodeGen/X86/h-registers-2.ll        |     2 +-
 llvm/test/CodeGen/X86/half-constrained.ll     |     6 +-
 llvm/test/CodeGen/X86/half.ll                 |   265 +-
 llvm/test/CodeGen/X86/hipe-cc.ll              |     2 +-
 ...st-and-by-const-from-lshr-in-eqcmp-zero.ll |    50 +-
 ...ist-and-by-const-from-shl-in-eqcmp-zero.ll |    58 +-
 .../CodeGen/X86/horizontal-reduce-smax.ll     |    70 +-
 .../CodeGen/X86/horizontal-reduce-smin.ll     |    62 +-
 .../CodeGen/X86/horizontal-reduce-umax.ll     |   321 +-
 .../CodeGen/X86/horizontal-reduce-umin.ll     |   340 +-
 llvm/test/CodeGen/X86/horizontal-shuffle-4.ll |    37 +-
 .../X86/horizontal-shuffle-demanded.ll        |    32 +-
 llvm/test/CodeGen/X86/i128-add.ll             |    32 +-
 llvm/test/CodeGen/X86/i128-fp128-abi.ll       |   390 +-
 llvm/test/CodeGen/X86/i128-mul.ll             |    39 +-
 llvm/test/CodeGen/X86/i128-sdiv.ll            |   399 +-
 llvm/test/CodeGen/X86/i128-udiv.ll            |  2938 +--
 llvm/test/CodeGen/X86/i386-baseptr.ll         |     8 +-
 llvm/test/CodeGen/X86/i386-shrink-wrapping.ll |     4 +-
 llvm/test/CodeGen/X86/i64-mem-copy.ll         |    38 +-
 .../CodeGen/X86/i686-win-shrink-wrapping.ll   |     4 +-
 llvm/test/CodeGen/X86/iabs.ll                 |    37 +-
 llvm/test/CodeGen/X86/icmp-abs-C.ll           |    62 +-
 .../test/CodeGen/X86/icmp-pow2-logic-npow2.ll |    54 +-
 llvm/test/CodeGen/X86/icmp-shift-opt.ll       |    64 +-
 .../CodeGen/X86/illegal-bitfield-loadstore.ll |    27 +-
 llvm/test/CodeGen/X86/immediate_merging.ll    |    10 +-
 llvm/test/CodeGen/X86/inalloca-stdcall.ll     |    26 +-
 llvm/test/CodeGen/X86/inalloca.ll             |    65 +-
 llvm/test/CodeGen/X86/inline-asm-fpstack.ll   |    10 +-
 llvm/test/CodeGen/X86/insert.ll               |    24 +-
 .../CodeGen/X86/insertelement-duplicates.ll   |     8 +-
 .../CodeGen/X86/insertelement-legalize.ll     |    16 +-
 .../CodeGen/X86/insertelement-var-index.ll    |   190 +-
 .../CodeGen/X86/insertps-unfold-load-bug.ll   |     7 +-
 llvm/test/CodeGen/X86/int-to-fp-demanded.ll   |    14 +-
 llvm/test/CodeGen/X86/invpcid-intrinsic.ll    |     4 +-
 llvm/test/CodeGen/X86/ipra-local-linkage-2.ll |    89 +-
 llvm/test/CodeGen/X86/is_fpclass-fp80.ll      |   186 +-
 llvm/test/CodeGen/X86/is_fpclass.ll           |   102 +-
 llvm/test/CodeGen/X86/isel-and.ll             |     6 +-
 llvm/test/CodeGen/X86/isel-fabs.ll            |     2 +-
 llvm/test/CodeGen/X86/isel-fneg.ll            |    59 +-
 llvm/test/CodeGen/X86/isel-llvm.atan2.ll      |     6 +-
 llvm/test/CodeGen/X86/isel-llvm.sincos.ll     |   139 +-
 llvm/test/CodeGen/X86/isel-or.ll              |     2 +-
 llvm/test/CodeGen/X86/isel-sdiv.ll            |    11 -
 llvm/test/CodeGen/X86/isel-select-cmov.ll     |     8 +-
 llvm/test/CodeGen/X86/isel-select-fcmov.ll    |    14 +-
 llvm/test/CodeGen/X86/isel-sint-to-fp-x87.ll  |    28 +-
 llvm/test/CodeGen/X86/isel-srem.ll            |    37 +-
 llvm/test/CodeGen/X86/isel-udiv.ll            |    11 -
 llvm/test/CodeGen/X86/isel-urem.ll            |    37 +-
 llvm/test/CodeGen/X86/isel-x87.ll             |    71 +-
 llvm/test/CodeGen/X86/isel-xor.ll             |     6 +-
 llvm/test/CodeGen/X86/jump_sign.ll            |    12 +-
 llvm/test/CodeGen/X86/keylocker-intrinsics.ll |    78 +-
 llvm/test/CodeGen/X86/known-bits-vector.ll    |    20 +-
 llvm/test/CodeGen/X86/known-bits.ll           |    40 +-
 llvm/test/CodeGen/X86/known-never-zero.ll     |   456 +-
 .../test/CodeGen/X86/known-signbits-vector.ll |    44 +-
 .../X86/lack-of-signed-truncation-check.ll    |    18 +-
 llvm/test/CodeGen/X86/ldexp-libcall.ll        |     6 +-
 llvm/test/CodeGen/X86/ldexp-not-readonly.ll   |     4 +-
 .../test/CodeGen/X86/ldexp-wrong-signature.ll |     8 +-
 .../CodeGen/X86/ldexp-wrong-signature2.ll     |     8 +-
 llvm/test/CodeGen/X86/ldexp.ll                |   102 +-
 llvm/test/CodeGen/X86/lea-2.ll                |     6 +-
 llvm/test/CodeGen/X86/lea-opt-cse1.ll         |     2 +-
 llvm/test/CodeGen/X86/lea-opt-cse3.ll         |     8 +-
 llvm/test/CodeGen/X86/lea-opt-cse4.ll         |     2 +-
 .../test/CodeGen/X86/lea-opt-memop-check-1.ll |    39 +-
 llvm/test/CodeGen/X86/legalize-shift-64.ll    |    92 +-
 llvm/test/CodeGen/X86/legalize-shl-vec.ll     |   150 +-
 llvm/test/CodeGen/X86/llrint-conv.ll          |   126 +-
 llvm/test/CodeGen/X86/llround-conv.ll         |    32 +-
 llvm/test/CodeGen/X86/llvm.frexp.ll           |    40 +-
 llvm/test/CodeGen/X86/llvm.sincos.vec.ll      |    86 +-
 llvm/test/CodeGen/X86/load-combine.ll         |    58 +-
 llvm/test/CodeGen/X86/localescape.ll          |   149 +
 .../CodeGen/X86/loop-strength-reduce-2.ll     |    46 +
 .../CodeGen/X86/loop-strength-reduce-3.ll     |    20 +
 llvm/test/CodeGen/X86/loop-strength-reduce.ll |    20 +
 .../test/CodeGen/X86/loop-strength-reduce5.ll |     2 +-
 .../test/CodeGen/X86/loop-strength-reduce7.ll |     4 +-
 llvm/test/CodeGen/X86/lrint-conv-i32.ll       |    64 +-
 llvm/test/CodeGen/X86/lrint-conv-i64.ll       |    97 +-
 llvm/test/CodeGen/X86/lround-conv-i32.ll      |    16 +-
 llvm/test/CodeGen/X86/lround-conv-i64.ll      |    16 +-
 llvm/test/CodeGen/X86/lwp-intrinsics.ll       |    20 +-
 .../test/CodeGen/X86/masked_gather_scatter.ll |   485 +-
 llvm/test/CodeGen/X86/masked_load.ll          |   100 +-
 llvm/test/CodeGen/X86/masked_store.ll         |    94 +-
 llvm/test/CodeGen/X86/materialize.ll          |   829 +-
 llvm/test/CodeGen/X86/mcu-abi.ll              |    14 +-
 llvm/test/CodeGen/X86/memcmp-mergeexpand.ll   |    12 +-
 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll   |   141 +-
 .../CodeGen/X86/memcmp-more-load-pairs-x32.ll |  1342 +-
 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll   |   202 +-
 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll      |   202 +-
 llvm/test/CodeGen/X86/memcmp-x32.ll           |   833 +-
 llvm/test/CodeGen/X86/memcpy-2.ll             |    70 +-
 llvm/test/CodeGen/X86/memset-2.ll             |    24 +-
 .../X86/memset-sse-stack-realignment.ll       |    12 +-
 llvm/test/CodeGen/X86/memset64-on-x86-32.ll   |     4 +-
 .../X86/merge-consecutive-loads-128.ll        |   343 +-
 .../X86/merge-consecutive-loads-256.ll        |    12 +-
 .../X86/merge-consecutive-stores-nt.ll        |   104 +-
 llvm/test/CodeGen/X86/merge-sp-update-lea.ll  |    27 +
 .../test/CodeGen/X86/merge-store-constants.ll |    14 +-
 llvm/test/CodeGen/X86/mfence.ll               |     2 +-
 llvm/test/CodeGen/X86/midpoint-int.ll         |   433 +-
 llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll |    19 +-
 llvm/test/CodeGen/X86/mmx-arith.ll            |    30 +-
 llvm/test/CodeGen/X86/mmx-build-vector.ll     |    80 +-
 llvm/test/CodeGen/X86/mmx-cvt.ll              |    14 +-
 llvm/test/CodeGen/X86/mmx-fold-load.ll        |   104 +-
 llvm/test/CodeGen/X86/mmx-fold-zero.ll        |     2 +-
 llvm/test/CodeGen/X86/movtopush.ll            |  1494 ++
 llvm/test/CodeGen/X86/mul-constant-i16.ll     |     4 +-
 llvm/test/CodeGen/X86/mul-constant-i32.ll     |    12 +-
 llvm/test/CodeGen/X86/mul-constant-i64.ll     |    24 +-
 llvm/test/CodeGen/X86/mul-constant-result.ll  |   392 +-
 llvm/test/CodeGen/X86/mul-i1024.ll            |  4945 +++--
 llvm/test/CodeGen/X86/mul-i256.ll             |   392 +-
 llvm/test/CodeGen/X86/mul-i512.ll             |  1119 +-
 llvm/test/CodeGen/X86/mul128.ll               |    65 +-
 llvm/test/CodeGen/X86/musttail-inalloca.ll    |     3 +-
 llvm/test/CodeGen/X86/musttail-struct.ll      |     8 +-
 llvm/test/CodeGen/X86/musttail-varargs.ll     |     6 +-
 llvm/test/CodeGen/X86/musttail.ll             |    34 +-
 llvm/test/CodeGen/X86/neg-abs.ll              |   107 +-
 llvm/test/CodeGen/X86/neg_fp.ll               |    23 +-
 llvm/test/CodeGen/X86/negate-add-zero.ll      |    52 +-
 llvm/test/CodeGen/X86/no-ret-in-x87-reg.ll    |    56 +-
 llvm/test/CodeGen/X86/nomovtopush.ll          |    14 +-
 llvm/test/CodeGen/X86/nontemporal.ll          |   112 +-
 llvm/test/CodeGen/X86/nosse-vector.ll         |   199 +-
 llvm/test/CodeGen/X86/not-of-dec.ll           |     2 +-
 llvm/test/CodeGen/X86/not-shift.ll            |    72 +-
 llvm/test/CodeGen/X86/optimize-max-0.ll       |   332 +-
 llvm/test/CodeGen/X86/or-lea.ll               |    36 +-
 llvm/test/CodeGen/X86/overflow.ll             |    25 +-
 llvm/test/CodeGen/X86/packss.ll               |   242 +-
 llvm/test/CodeGen/X86/packus.ll               |   746 +-
 llvm/test/CodeGen/X86/partial-fold32.ll       |    59 +-
 llvm/test/CodeGen/X86/peep-test-3.ll          |    16 +-
 .../X86/peephole-na-phys-copy-folding.ll      |    22 +-
 llvm/test/CodeGen/X86/pic.ll                  |   207 +-
 llvm/test/CodeGen/X86/pmovsx-inreg.ll         |    48 +-
 llvm/test/CodeGen/X86/pointer-vector.ll       |     8 +-
 llvm/test/CodeGen/X86/popcnt.ll               |   434 +-
 llvm/test/CodeGen/X86/powi-const.ll           |    10 +-
 llvm/test/CodeGen/X86/powi.ll                 |     6 +-
 llvm/test/CodeGen/X86/pr111170.ll             |    22 +-
 llvm/test/CodeGen/X86/pr118934.ll             |    22 +-
 llvm/test/CodeGen/X86/pr134607.ll             |     2 +-
 llvm/test/CodeGen/X86/pr14314.ll              |    10 +-
 llvm/test/CodeGen/X86/pr147781.ll             |     4 +-
 llvm/test/CodeGen/X86/pr15309.ll              |    30 +-
 llvm/test/CodeGen/X86/pr179489.ll             |    10 +-
 llvm/test/CodeGen/X86/pr18344.ll              |    30 +-
 llvm/test/CodeGen/X86/pr20011.ll              |     8 +-
 llvm/test/CodeGen/X86/pr22338.ll              |    14 +-
 llvm/test/CodeGen/X86/pr22970.ll              |    16 +-
 llvm/test/CodeGen/X86/pr25725.ll              |     2 +-
 llvm/test/CodeGen/X86/pr2656.ll               |     8 +-
 llvm/test/CodeGen/X86/pr28824.ll              |     4 +-
 llvm/test/CodeGen/X86/pr29170.ll              |     2 +-
 llvm/test/CodeGen/X86/pr30284.ll              |    12 +-
 llvm/test/CodeGen/X86/pr31088.ll              |    16 +-
 llvm/test/CodeGen/X86/pr32282.ll              |    28 +-
 llvm/test/CodeGen/X86/pr32284.ll              |    59 +-
 llvm/test/CodeGen/X86/pr32329.ll              |    65 +-
 llvm/test/CodeGen/X86/pr32345.ll              |     4 +-
 llvm/test/CodeGen/X86/pr32610.ll              |    30 +-
 llvm/test/CodeGen/X86/pr32659.ll              |    12 +-
 llvm/test/CodeGen/X86/pr34080-2.ll            |    70 +-
 llvm/test/CodeGen/X86/pr34088.ll              |     2 +-
 llvm/test/CodeGen/X86/pr34421.ll              |     4 +-
 llvm/test/CodeGen/X86/pr3457.ll               |    19 +-
 llvm/test/CodeGen/X86/pr34605.ll              |    16 +-
 llvm/test/CodeGen/X86/pr34855.ll              |     2 +-
 llvm/test/CodeGen/X86/pr35918.ll              |     8 +-
 llvm/test/CodeGen/X86/pr35972.ll              |     8 +-
 llvm/test/CodeGen/X86/pr35982.ll              |    14 +-
 llvm/test/CodeGen/X86/pr38539.ll              |   308 +-
 llvm/test/CodeGen/X86/pr38738.ll              |    50 +-
 llvm/test/CodeGen/X86/pr38795.ll              |     2 +-
 llvm/test/CodeGen/X86/pr38819.ll              |     6 +-
 llvm/test/CodeGen/X86/pr40539.ll              |    12 +-
 llvm/test/CodeGen/X86/pr44396.ll              |     3 +-
 llvm/test/CodeGen/X86/pr44915.ll              |    44 +-
 llvm/test/CodeGen/X86/pr46527.ll              |    14 +-
 llvm/test/CodeGen/X86/pr49028.ll              |     2 +-
 llvm/test/CodeGen/X86/pr49451.ll              |    14 +-
 llvm/test/CodeGen/X86/pr50782.ll              |     4 +-
 .../CodeGen/X86/pr51878_computeAliasing.ll    |     2 +-
 llvm/test/CodeGen/X86/pr53419.ll              |    80 +-
 llvm/test/CodeGen/X86/pr57283.ll              |     2 +-
 llvm/test/CodeGen/X86/pr59305.ll              |    11 +-
 llvm/test/CodeGen/X86/pr62145.ll              |     4 +-
 llvm/test/CodeGen/X86/pr69965.ll              |    10 +-
 llvm/test/CodeGen/X86/pr78897.ll              |    58 +-
 llvm/test/CodeGen/X86/pr89877.ll              |     6 +-
 llvm/test/CodeGen/X86/promote-i16.ll          |     6 +-
 llvm/test/CodeGen/X86/promote-vec3.ll         |    18 +-
 .../CodeGen/X86/pseudo_cmov_lower-fp16.ll     |    72 +-
 llvm/test/CodeGen/X86/pseudo_cmov_lower.ll    |  1941 +-
 llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll   |     7 +-
 .../CodeGen/X86/pull-binop-through-shift.ll   |    48 +-
 .../pull-conditional-binop-through-shift.ll   |    72 +-
 llvm/test/CodeGen/X86/rdrand.ll               |    18 +-
 llvm/test/CodeGen/X86/rdseed.ll               |    18 +-
 llvm/test/CodeGen/X86/rdtsc-upgrade.ll        |     2 +-
 llvm/test/CodeGen/X86/rdtsc.ll                |     2 +-
 .../X86/regalloc-advanced-split-cost.ll       |    49 +-
 .../CodeGen/X86/replace-load-and-with-bzhi.ll |    12 +-
 llvm/test/CodeGen/X86/retpoline-external.ll   |   258 +
 llvm/test/CodeGen/X86/retpoline.ll            |  1388 +-
 llvm/test/CodeGen/X86/rev16.ll                |    40 +-
 llvm/test/CodeGen/X86/rot16.ll                |    26 +-
 .../test/CodeGen/X86/rotate-extract-vector.ll |    99 +-
 llvm/test/CodeGen/X86/rotate-extract.ll       |    32 +-
 llvm/test/CodeGen/X86/rotate.ll               |    70 +-
 llvm/test/CodeGen/X86/rotate4.ll              |    85 +-
 llvm/test/CodeGen/X86/sadd_sat.ll             |    62 +-
 llvm/test/CodeGen/X86/sadd_sat_plus.ll        |    16 +-
 llvm/test/CodeGen/X86/sar_fold.ll             |     4 +-
 llvm/test/CodeGen/X86/scalar-ext-logic.ll     |    16 +-
 llvm/test/CodeGen/X86/scalar-extract.ll       |     4 +-
 llvm/test/CodeGen/X86/scalar-fp-to-i32.ll     |   136 +-
 llvm/test/CodeGen/X86/scalar-fp-to-i64.ll     |   172 +-
 llvm/test/CodeGen/X86/scalar-int-to-fp.ll     |    12 +-
 llvm/test/CodeGen/X86/scmp.ll                 |  1087 +-
 llvm/test/CodeGen/X86/sdiv_fix.ll             |   271 +-
 llvm/test/CodeGen/X86/sdiv_fix_sat.ll         |   465 +-
 .../CodeGen/X86/segmented-stacks-dynamic.ll   |     4 +-
 llvm/test/CodeGen/X86/seh-catch-all-win32.ll  |    56 +
 llvm/test/CodeGen/X86/seh-stack-realign.ll    |    60 +
 llvm/test/CodeGen/X86/select-constant-xor.ll  |    13 +-
 llvm/test/CodeGen/X86/select-lea.ll           |    78 +-
 llvm/test/CodeGen/X86/select-mmx.ll           |     4 +-
 llvm/test/CodeGen/X86/select-smin-smax.ll     |   112 +-
 llvm/test/CodeGen/X86/select.ll               |   340 +-
 llvm/test/CodeGen/X86/select_const.ll         |    92 +-
 llvm/test/CodeGen/X86/setcc.ll                |    10 +-
 llvm/test/CodeGen/X86/sext-i1.ll              |    10 +-
 llvm/test/CodeGen/X86/shift-amount-mod.ll     |   329 +-
 llvm/test/CodeGen/X86/shift-and.ll            |    49 +-
 llvm/test/CodeGen/X86/shift-bmi2.ll           |    18 +-
 llvm/test/CodeGen/X86/shift-by-signext.ll     |     4 +-
 llvm/test/CodeGen/X86/shift-coalesce.ll       |     4 +-
 llvm/test/CodeGen/X86/shift-combine.ll        |   190 +-
 llvm/test/CodeGen/X86/shift-double.ll         |    12 +-
 llvm/test/CodeGen/X86/shift-folding.ll        |     2 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |   705 +-
 llvm/test/CodeGen/X86/shift-i256.ll           |  1401 +-
 llvm/test/CodeGen/X86/shift-mask.ll           |     2 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll          |   936 +-
 llvm/test/CodeGen/X86/shrink_vmul_sse.ll      |    22 +-
 llvm/test/CodeGen/X86/shuffle-blendw.ll       |     4 +-
 llvm/test/CodeGen/X86/sibcall.ll              |     3 +-
 .../CodeGen/X86/signed-truncation-check.ll    |    18 +-
 llvm/test/CodeGen/X86/sincos-stack-args.ll    |     5 +-
 llvm/test/CodeGen/X86/sink-addsub-of-const.ll |    24 +-
 llvm/test/CodeGen/X86/slow-pmulld.ll          |   833 +-
 llvm/test/CodeGen/X86/slow-unaligned-mem.ll   |    93 +-
 llvm/test/CodeGen/X86/smax.ll                 |   428 +-
 llvm/test/CodeGen/X86/smin.ll                 |   437 +-
 llvm/test/CodeGen/X86/smul-with-overflow.ll   |   557 +-
 llvm/test/CodeGen/X86/smul_fix.ll             |    68 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll         |   191 +-
 .../X86/smulo-128-legalisation-lowering.ll    |  1143 +-
 llvm/test/CodeGen/X86/split-vector-bitcast.ll |    16 +-
 .../CodeGen/X86/srem-seteq-illegal-types.ll   |    56 +-
 llvm/test/CodeGen/X86/srem-seteq-optsize.ll   |     6 +-
 llvm/test/CodeGen/X86/sse-fcopysign.ll        |    32 +-
 llvm/test/CodeGen/X86/sse-intel-ocl.ll        |    56 +-
 .../CodeGen/X86/sse-intrinsics-fast-isel.ll   |   204 +-
 llvm/test/CodeGen/X86/sse-only.ll             |     4 +-
 llvm/test/CodeGen/X86/sse-regcall.ll          |   182 +-
 llvm/test/CodeGen/X86/sse-regcall4.ll         |   183 +-
 llvm/test/CodeGen/X86/sse1.ll                 |   103 +-
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |    38 +-
 .../X86/sse2-intrinsics-x86-upgrade.ll        |    12 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   177 +-
 llvm/test/CodeGen/X86/sse3.ll                 |    16 +-
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll |     9 -
 llvm/test/CodeGen/X86/sse41.ll                |   336 +-
 llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll |    18 +-
 llvm/test/CodeGen/X86/sshl_sat.ll             |    20 +-
 llvm/test/CodeGen/X86/sshl_sat_vec.ll         |   708 +-
 llvm/test/CodeGen/X86/ssub_sat.ll             |    52 +-
 llvm/test/CodeGen/X86/ssub_sat_plus.ll        |    20 +-
 llvm/test/CodeGen/X86/stack-align-memcpy.ll   |   156 +-
 llvm/test/CodeGen/X86/stack-align.ll          |    21 +-
 .../CodeGen/X86/stack-clash-dynamic-alloca.ll |     2 +-
 llvm/test/CodeGen/X86/stack-clash-large.ll    |    55 +-
 llvm/test/CodeGen/X86/stack-coloring-wineh.ll |    12 +-
 llvm/test/CodeGen/X86/stack-protector-msvc.ll |    12 +-
 llvm/test/CodeGen/X86/store-narrow.ll         |     8 +-
 .../CodeGen/X86/store-zero-and-minus-one.ll   |    18 +-
 llvm/test/CodeGen/X86/storetrunc-fp.ll        |     2 +-
 llvm/test/CodeGen/X86/strict-fsub-combines.ll |    12 +-
 llvm/test/CodeGen/X86/sttni.ll                |   187 +-
 llvm/test/CodeGen/X86/sub-of-bias.ll          |    12 +-
 llvm/test/CodeGen/X86/subvector-broadcast.ll  |   166 +-
 llvm/test/CodeGen/X86/swifterror.ll           |   104 +-
 llvm/test/CodeGen/X86/tailcall-tailcc.ll      |    56 +-
 llvm/test/CodeGen/X86/tailcall.ll             |    22 +-
 llvm/test/CodeGen/X86/test-shrink-bug.ll      |     4 +-
 llvm/test/CodeGen/X86/tls-desc.ll             |     8 +-
 llvm/test/CodeGen/X86/trunc-srl-load.ll       |    76 +-
 llvm/test/CodeGen/X86/trunc-to-bool.ll        |     2 +-
 llvm/test/CodeGen/X86/uadd_sat.ll             |    50 +-
 llvm/test/CodeGen/X86/uadd_sat_plus.ll        |    18 +-
 llvm/test/CodeGen/X86/ucmp.ll                 |  1258 +-
 llvm/test/CodeGen/X86/udiv_fix.ll             |    95 +-
 llvm/test/CodeGen/X86/udiv_fix_sat.ll         |   131 +-
 llvm/test/CodeGen/X86/uint64-to-float.ll      |     4 +-
 llvm/test/CodeGen/X86/uint_to_fp.ll           |     6 +-
 llvm/test/CodeGen/X86/umax.ll                 |   825 +-
 llvm/test/CodeGen/X86/umin.ll                 |   437 +-
 llvm/test/CodeGen/X86/umul-with-overflow.ll   |   467 +-
 llvm/test/CodeGen/X86/umul_fix.ll             |    60 +-
 llvm/test/CodeGen/X86/umul_fix_sat.ll         |   141 +-
 .../X86/umulo-128-legalisation-lowering.ll    |    58 +-
 .../X86/umulo-64-legalisation-lowering.ll     |    22 +-
 llvm/test/CodeGen/X86/urem-seteq-optsize.ll   |     4 +-
 llvm/test/CodeGen/X86/ushl_sat.ll             |    88 +-
 llvm/test/CodeGen/X86/ushl_sat_vec.ll         |   540 +-
 llvm/test/CodeGen/X86/usub_sat.ll             |    54 +-
 llvm/test/CodeGen/X86/usub_sat_plus.ll        |    26 +-
 llvm/test/CodeGen/X86/v2f32.ll                |     2 +-
 llvm/test/CodeGen/X86/v8i1-masks.ll           |   492 +-
 llvm/test/CodeGen/X86/vaargs-win32.ll         |     4 +-
 .../X86/variable-sized-darwin-bzero.ll        |     4 +-
 llvm/test/CodeGen/X86/vec-strict-cmp-128.ll   |   200 +-
 .../test/CodeGen/X86/vec-strict-cmp-sub128.ll |    24 +-
 .../CodeGen/X86/vec-strict-fptoint-128.ll     |    20 +-
 .../CodeGen/X86/vec-strict-inttofp-128.ll     |     4 +-
 .../CodeGen/X86/vec-strict-inttofp-256.ll     |    34 +-
 llvm/test/CodeGen/X86/vec3-setcc-crash.ll     |     4 +-
 llvm/test/CodeGen/X86/vec_anyext.ll           |    32 +-
 llvm/test/CodeGen/X86/vec_extract-avx.ll      |     8 +-
 llvm/test/CodeGen/X86/vec_extract-sse4.ll     |     6 +-
 llvm/test/CodeGen/X86/vec_extract.ll          |    44 +-
 llvm/test/CodeGen/X86/vec_fabs.ll             |    24 +-
 llvm/test/CodeGen/X86/vec_fcopysign.ll        |    88 +-
 llvm/test/CodeGen/X86/vec_fneg.ll             |     4 +-
 llvm/test/CodeGen/X86/vec_fpext.ll            |    46 +-
 llvm/test/CodeGen/X86/vec_fptrunc.ll          |    38 +-
 llvm/test/CodeGen/X86/vec_ins_extract-1.ll    |     8 +-
 llvm/test/CodeGen/X86/vec_insert-4.ll         |     4 +-
 llvm/test/CodeGen/X86/vec_insert-5.ll         |     2 +-
 llvm/test/CodeGen/X86/vec_insert-8.ll         |     4 +-
 llvm/test/CodeGen/X86/vec_logical.ll          |    12 +-
 llvm/test/CodeGen/X86/vec_set.ll              |     2 +-
 llvm/test/CodeGen/X86/vec_shift5.ll           |    63 +-
 llvm/test/CodeGen/X86/vec_ss_load_fold.ll     |    10 +-
 llvm/test/CodeGen/X86/vec_zero.ll             |     2 +-
 llvm/test/CodeGen/X86/vec_zero_cse.ll         |    12 +-
 llvm/test/CodeGen/X86/vector-extend-inreg.ll  |    54 +-
 llvm/test/CodeGen/X86/vector-fshl-128.ll      |    14 +-
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll  |     2 +-
 llvm/test/CodeGen/X86/vector-fshr-128.ll      |     2 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  |     2 +-
 llvm/test/CodeGen/X86/vector-gep.ll           |   138 +-
 llvm/test/CodeGen/X86/vector-idiv-v2i32.ll    |    88 +-
 llvm/test/CodeGen/X86/vector-llrint.ll        |  1541 +-
 llvm/test/CodeGen/X86/vector-lrint.ll         |  2269 +-
 llvm/test/CodeGen/X86/vector-mul.ll           |    28 +-
 llvm/test/CodeGen/X86/vector-reduce-and.ll    |    66 +-
 .../test/CodeGen/X86/vector-reduce-or-bool.ll |    20 +-
 llvm/test/CodeGen/X86/vector-reduce-or.ll     |    66 +-
 .../CodeGen/X86/vector-reduce-xor-bool.ll     |    20 +-
 llvm/test/CodeGen/X86/vector-reduce-xor.ll    |    66 +-
 llvm/test/CodeGen/X86/vector-rotate-128.ll    |    34 +-
 llvm/test/CodeGen/X86/vector-sext.ll          |    12 +-
 .../test/CodeGen/X86/vector-shift-lshr-128.ll |    10 +-
 .../test/CodeGen/X86/vector-shuffle-avx512.ll |    36 +-
 .../X86/vector-shuffle-combining-avx.ll       |   109 +-
 .../X86/vector-shuffle-combining-avx2.ll      |   359 +-
 .../X86/vector-shuffle-combining-avx512bw.ll  |    86 +-
 .../vector-shuffle-combining-avx512bwvl.ll    |    26 +-
 .../X86/vector-shuffle-combining-avx512f.ll   |    24 +-
 .../vector-shuffle-combining-avx512vbmi.ll    |     6 +-
 .../X86/vector-shuffle-combining-avx512vl.ll  |    60 +-
 .../X86/vector-shuffle-combining-xop.ll       |     4 +-
 llvm/test/CodeGen/X86/vector-shuffle-mmx.ll   |    14 +-
 .../X86/vp2intersect_multiple_pairs.ll        |    62 +-
 llvm/test/CodeGen/X86/vshift-1.ll             |    16 +-
 llvm/test/CodeGen/X86/vshift-2.ll             |    16 +-
 llvm/test/CodeGen/X86/vshift-3.ll             |    14 +-
 llvm/test/CodeGen/X86/vshift-4.ll             |    18 +-
 llvm/test/CodeGen/X86/vshift-5.ll             |     8 +-
 llvm/test/CodeGen/X86/vshift-6.ll             |     2 +-
 llvm/test/CodeGen/X86/vshift_split2.ll        |     4 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 17816 +++++++---------
 .../X86/wide-scalar-shift-legalization.ll     |  6384 +++---
 ...ad-of-small-alloca-with-zero-upper-half.ll |  2513 ++-
 .../CodeGen/X86/widen-load-of-small-alloca.ll |  1501 +-
 llvm/test/CodeGen/X86/widen_arith-1.ll        |     2 +-
 llvm/test/CodeGen/X86/widen_arith-3.ll        |     6 +-
 llvm/test/CodeGen/X86/widen_arith-6.ll        |     6 +-
 llvm/test/CodeGen/X86/widen_cast-1.ll         |     2 +-
 llvm/test/CodeGen/X86/widen_cast-2.ll         |    10 +-
 llvm/test/CodeGen/X86/widen_cast-3.ll         |     2 +-
 llvm/test/CodeGen/X86/widen_cast-5.ll         |     2 +-
 llvm/test/CodeGen/X86/widen_compare-1.ll      |     4 +-
 llvm/test/CodeGen/X86/widen_conv-1.ll         |    10 +-
 llvm/test/CodeGen/X86/widen_conv-2.ll         |     2 +-
 llvm/test/CodeGen/X86/widen_conv-3.ll         |    22 +-
 llvm/test/CodeGen/X86/widen_conv-4.ll         |    42 +-
 llvm/test/CodeGen/X86/widen_extract-1.ll      |     2 +-
 llvm/test/CodeGen/X86/widen_load-0.ll         |    10 +-
 llvm/test/CodeGen/X86/widen_load-2.ll         |    80 +-
 llvm/test/CodeGen/X86/widen_load-3.ll         |    82 +-
 llvm/test/CodeGen/X86/widen_shuffle-1.ll      |     8 +-
 llvm/test/CodeGen/X86/win-catchpad.ll         |   294 +
 llvm/test/CodeGen/X86/win-smallparams.ll      |    38 +-
 llvm/test/CodeGen/X86/win32-eh.ll             |   172 +-
 .../CodeGen/X86/win32-int-runtime-libcalls.ll |    56 +-
 llvm/test/CodeGen/X86/win32_sret.ll           |    93 +-
 llvm/test/CodeGen/X86/x32-va_start.ll         |    11 +-
 llvm/test/CodeGen/X86/x86-32-intrcc.ll        |    28 +-
 .../CodeGen/X86/x86-32-vector-calling-conv.ll |    12 +-
 llvm/test/CodeGen/X86/x86-shifts.ll           |   255 +-
 llvm/test/CodeGen/X86/x87.ll                  |    29 +-
 llvm/test/CodeGen/X86/xaluo128.ll             |   112 +-
 llvm/test/CodeGen/X86/xmulo.ll                |   605 +-
 llvm/test/CodeGen/X86/xor-icmp.ll             |    20 +-
 llvm/test/CodeGen/X86/xor-lea.ll              |     2 +-
 llvm/test/CodeGen/X86/xor.ll                  |     4 +-
 .../CodeGen/X86/zero-call-used-regs-i386.ll   |     6 +-
 llvm/test/CodeGen/X86/zext-fold.ll            |     7 +-
 llvm/test/CodeGen/X86/zext-lshr.ll            |     4 +-
 llvm/test/CodeGen/X86/zext-shl.ll             |     4 +-
 690 files changed, 65203 insertions(+), 60393 deletions(-)

diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 692c7938ddc00..056565d575c20 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -431,6 +431,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// Enable the MachineScheduler pass for all X86 subtargets.
   bool enableMachineScheduler() const override { return true; }
 
+  bool enableMachineSchedDefaultSched() const override { return is64Bit(); }
+
   bool enableEarlyIfConversion() const override;
 
   void getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>>
diff --git a/llvm/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/llvm/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index 0805f7a3704a3..eff72778b67f1 100644
--- a/llvm/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/llvm/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -7,11 +7,11 @@ define i32 @f(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    imull %ecx, %edx
-; CHECK-NEXT:    imull %eax, %ecx
+; CHECK-NEXT:    imull %eax, %edx
+; CHECK-NEXT:    imull %ecx, %ecx
 ; CHECK-NEXT:    imull %eax, %eax
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    leal (%eax,%ecx,2), %eax
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    leal (%eax,%edx,2), %eax
 ; CHECK-NEXT:    retl
   %tmp.2 = mul i32 %a, %a
   %tmp.5 = shl i32 %a, 1
diff --git a/llvm/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll b/llvm/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
index 88e745491d879..3e13ccc90573a 100644
--- a/llvm/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
+++ b/llvm/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
@@ -10,12 +10,13 @@
 define void @test(i32 %A) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andb $16, %cl
-; CHECK-NEXT:    shll %cl, B
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    shrl $3, %eax
 ; CHECK-NEXT:    addl %eax, C
+; CHECK-NEXT:    andb $16, %cl
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shll %cl, B
 ; CHECK-NEXT:    retl
 	%A.upgrd.1 = trunc i32 %A to i8		; <i8> [#uses=1]
 	%tmp2 = load i32, ptr @B		; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll b/llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll
index 16ef67724f883..66eadf4bb7a82 100644
--- a/llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll
+++ b/llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll
@@ -8,16 +8,16 @@
 define void @test() {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl A, %eax
-; CHECK-NEXT:    movzwl 2(%eax), %eax
 ; CHECK-NEXT:    movzbl B, %ecx
-; CHECK-NEXT:    movl C, %edx
 ; CHECK-NEXT:    andb $16, %cl
-; CHECK-NEXT:    shll %cl, %edx
+; CHECK-NEXT:    movl C, %eax
+; CHECK-NEXT:    shll %cl, %eax
+; CHECK-NEXT:    movl A, %edx
+; CHECK-NEXT:    movzwl 2(%edx), %edx
 ; CHECK-NEXT:    xorb $16, %cl
-; CHECK-NEXT:    shrl %cl, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    movl %eax, C
+; CHECK-NEXT:    shrl %cl, %edx
+; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    movl %edx, C
 ; CHECK-NEXT:    retl
 	%tmp = load ptr, ptr @A		; <ptr> [#uses=1]
 	%tmp1 = getelementptr i16, ptr %tmp, i32 1		; <ptr> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll b/llvm/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
index f3bdf561a9456..ae7536ba2cf90 100644
--- a/llvm/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
+++ b/llvm/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
@@ -4,9 +4,9 @@
 define i32 @foo(i32 %t, i32 %C) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %cond_true
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll b/llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll
index 206574eeae2ae..754a24e33418f 100644
--- a/llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll
+++ b/llvm/test/CodeGen/X86/2007-02-16-BranchFold.ll
@@ -29,77 +29,69 @@ declare i32 @fprintf(ptr, ptr, ...)
 define i16 @main_bb_2E_i9_2E_i_2E_i932_2E_ce(ptr %l_addr.01.0.i2.i.i929, ptr %tmp66.i62.i.out) {
 ; CHECK-LABEL: main_bb_2E_i9_2E_i_2E_i932_2E_ce:
 ; CHECK:       ## %bb.0: ## %newFuncRoot
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-NEXT:    subl $20, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $24, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %esi, -12
-; CHECK-NEXT:    .cfi_offset %edi, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %edi
-; CHECK-NEXT:    movl 8(%edi), %eax
-; CHECK-NEXT:    movl L_outfile$non_lazy_ptr, %ecx
-; CHECK-NEXT:    movl (%ecx), %ecx
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl L_str1$non_lazy_ptr, %eax
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ecx, (%esp)
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl L_outfile$non_lazy_ptr, %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl (%ecx), %esi
+; CHECK-NEXT:    movl 8(%esi), %ecx
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl L_str1$non_lazy_ptr, %ecx
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    calll _fprintf
-; CHECK-NEXT:    movl 20(%edi), %eax
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    movl 20(%esi), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    jle LBB0_6
 ; CHECK-NEXT:  ## %bb.1: ## %NodeBlock4
-; CHECK-NEXT:    cmpl $2, %eax
+; CHECK-NEXT:    cmpl $2, %ecx
 ; CHECK-NEXT:    jge LBB0_2
 ; CHECK-NEXT:  ## %bb.4: ## %LeafBlock2
-; CHECK-NEXT:    cmpl $1, %eax
+; CHECK-NEXT:    cmpl $1, %ecx
 ; CHECK-NEXT:    jne LBB0_3
 ; CHECK-NEXT:  ## %bb.5: ## %bb20.i.i937.exitStub
-; CHECK-NEXT:    movl %edi, (%esi)
+; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movw $3, %ax
-; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_6: ## %NodeBlock
 ; CHECK-NEXT:    js LBB0_9
 ; CHECK-NEXT:  ## %bb.7: ## %LeafBlock1
 ; CHECK-NEXT:    jne LBB0_3
 ; CHECK-NEXT:  ## %bb.8: ## %bb12.i.i935.exitStub
-; CHECK-NEXT:    movl %edi, (%esi)
+; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movw $2, %ax
-; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_2: ## %LeafBlock3
 ; CHECK-NEXT:    jne LBB0_3
 ; CHECK-NEXT:  ## %bb.11: ## %bb28.i.i938.exitStub
-; CHECK-NEXT:    movl %edi, (%esi)
+; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movw $4, %ax
-; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_9: ## %LeafBlock
-; CHECK-NEXT:    cmpl $-1, %eax
+; CHECK-NEXT:    cmpl $-1, %ecx
 ; CHECK-NEXT:    je LBB0_10
 ; CHECK-NEXT:  LBB0_3: ## %NewDefault
-; CHECK-NEXT:    movl %edi, (%esi)
+; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_10: ## %bb.i14.i.exitStub
-; CHECK-NEXT:    movl %edi, (%esi)
+; CHECK-NEXT:    movl %esi, (%eax)
 ; CHECK-NEXT:    movw $1, %ax
-; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 newFuncRoot:
 	br label %bb.i9.i.i932.ce
diff --git a/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index 49e2bf207e52a..f44a9050b1130 100644
--- a/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/llvm/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -18,8 +18,8 @@ define void @foo(ptr %buf, i32 %size, i32 %col, ptr %p) nounwind {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_2: ## %bb
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl (%esi), %edi
 ; CHECK-NEXT:    movzbl -8(%ecx), %ebx
+; CHECK-NEXT:    movl (%esi), %edi
 ; CHECK-NEXT:    movb %bl, (%edi,%edx)
 ; CHECK-NEXT:    movzbl -7(%ecx), %ebx
 ; CHECK-NEXT:    movb %bl, 7(%edi,%edx)
@@ -37,8 +37,8 @@ define void @foo(ptr %buf, i32 %size, i32 %col, ptr %p) nounwind {
 ; CHECK-NEXT:    movb %bl, 4(%edi,%edx)
 ; CHECK-NEXT:    movzbl (%ecx), %ebx
 ; CHECK-NEXT:    movb %bl, 6(%edi,%edx)
-; CHECK-NEXT:    addl $4, %esi
 ; CHECK-NEXT:    addl $9, %ecx
+; CHECK-NEXT:    addl $4, %esi
 ; CHECK-NEXT:    decl %eax
 ; CHECK-NEXT:    jne LBB0_2
 ; CHECK-NEXT:  LBB0_3: ## %return
diff --git a/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll b/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
index cfb3e508576dd..c76ac6c2b967d 100644
--- a/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
+++ b/llvm/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
@@ -12,12 +12,12 @@ define signext i16 @f(ptr %bp, ptr %ss)   {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %cond_next127
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl (%eax), %edx
-; CHECK-NEXT:    movl (%ecx), %esi
+; CHECK-NEXT:    movl (%ecx), %edx
 ; CHECK-NEXT:    andl $15, %edx
+; CHECK-NEXT:    movl (%eax), %esi
 ; CHECK-NEXT:    andl $15, %esi
-; CHECK-NEXT:    addl %esi, (%ecx)
-; CHECK-NEXT:    cmpl $63, %edx
+; CHECK-NEXT:    addl %edx, (%ecx)
+; CHECK-NEXT:    cmpl $63, %esi
 ; CHECK-NEXT:    jb .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %UnifiedReturnBlock
 ; CHECK-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
index d77d4352f8336..1da664bf42fbb 100644
--- a/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
+++ b/llvm/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
@@ -4,32 +4,33 @@
 define fastcc void @fht(ptr %fz, i16 signext  %n) {
 ; CHECK-LABEL: fht:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    subss %xmm3, %xmm1
-; CHECK-NEXT:    movaps %xmm3, %xmm4
-; CHECK-NEXT:    mulss %xmm0, %xmm4
-; CHECK-NEXT:    addss %xmm3, %xmm4
-; CHECK-NEXT:    movaps %xmm3, %xmm2
-; CHECK-NEXT:    subss %xmm4, %xmm2
-; CHECK-NEXT:    addss %xmm3, %xmm4
-; CHECK-NEXT:    xorps %xmm5, %xmm5
-; CHECK-NEXT:    subss %xmm1, %xmm5
-; CHECK-NEXT:    addss %xmm0, %xmm1
-; CHECK-NEXT:    mulss %xmm0, %xmm4
-; CHECK-NEXT:    mulss %xmm0, %xmm5
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    subss %xmm0, %xmm2
+; CHECK-NEXT:    xorps %xmm4, %xmm4
+; CHECK-NEXT:    subss %xmm2, %xmm4
+; CHECK-NEXT:    mulss %xmm1, %xmm4
+; CHECK-NEXT:    movaps %xmm0, %xmm3
+; CHECK-NEXT:    mulss %xmm1, %xmm3
+; CHECK-NEXT:    addss %xmm0, %xmm3
+; CHECK-NEXT:    movaps %xmm0, %xmm5
+; CHECK-NEXT:    addss %xmm3, %xmm5
+; CHECK-NEXT:    mulss %xmm1, %xmm5
 ; CHECK-NEXT:    addss %xmm4, %xmm5
-; CHECK-NEXT:    addss %xmm0, %xmm5
+; CHECK-NEXT:    addss %xmm1, %xmm5
 ; CHECK-NEXT:    movss %xmm5, 0
-; CHECK-NEXT:    movss %xmm3, (%ecx)
-; CHECK-NEXT:    addss %xmm0, %xmm3
-; CHECK-NEXT:    movss %xmm3, 0
-; CHECK-NEXT:    mulss %xmm0, %xmm1
-; CHECK-NEXT:    mulss %xmm0, %xmm2
+; CHECK-NEXT:    movss %xmm0, (%ecx)
+; CHECK-NEXT:    movaps %xmm0, %xmm4
+; CHECK-NEXT:    addss %xmm1, %xmm4
+; CHECK-NEXT:    movss %xmm4, 0
 ; CHECK-NEXT:    addss %xmm1, %xmm2
-; CHECK-NEXT:    addss %xmm0, %xmm2
-; CHECK-NEXT:    movss %xmm2, (%ecx)
+; CHECK-NEXT:    mulss %xmm1, %xmm2
+; CHECK-NEXT:    subss %xmm3, %xmm0
+; CHECK-NEXT:    mulss %xmm1, %xmm0
+; CHECK-NEXT:    addss %xmm2, %xmm0
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%ecx)
 ; CHECK-NEXT:    retl
 entry:
 	br i1 true, label %bb171.preheader, label %bb431
diff --git a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 6541693776099..e038c6421b46e 100644
--- a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -20,8 +20,8 @@ define fastcc void @mp_sqrt(i32 %n, i32 %radix, ptr %in, ptr %out, ptr %tmp1, pt
 ; CHECK-NEXT:  .LBB0_1: # %bb.i5
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    addl %ebx, %ebx
 ; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addl %ebx, %ebx
 ; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %mp_unexp_mp2d.exit.i
@@ -42,12 +42,12 @@ define fastcc void @mp_sqrt(i32 %n, i32 %radix, ptr %in, ptr %out, ptr %tmp1, pt
 ; CHECK-NEXT:    cvttsd2si %xmm1, %edi
 ; CHECK-NEXT:    cmpl %edx, %edi
 ; CHECK-NEXT:    cmovgel %eax, %edi
-; CHECK-NEXT:    addl $2, %ecx
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
 ; CHECK-NEXT:    cvtsi2sd %edi, %xmm2
 ; CHECK-NEXT:    xorpd %xmm1, %xmm1
 ; CHECK-NEXT:    subsd %xmm2, %xmm1
 ; CHECK-NEXT:    mulsd %xmm0, %xmm1
+; CHECK-NEXT:    addl $2, %ecx
 ; CHECK-NEXT:    addl $-2, %ebp
 ; CHECK-NEXT:    jne .LBB0_7
 ; CHECK-NEXT:  # %bb.8: # %mp_unexp_d2mp.exit29.i
diff --git a/llvm/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll b/llvm/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
index 788d2da789bfe..71e700ad52aae 100644
--- a/llvm/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
+++ b/llvm/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
@@ -15,12 +15,12 @@ define void @transpose4x4(ptr %dst, ptr %src, i32 %dst_stride, i32 %src_stride)
 ; CHECK-NEXT:    .cfi_offset %esi, -16
 ; CHECK-NEXT:    .cfi_offset %edi, -12
 ; CHECK-NEXT:    .cfi_offset %ebx, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    leal (%edi,%edi,2), %ebx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    leal (%ecx,%ecx,2), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    leal (%edi,%edi,2), %ebx
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    movd (%esi), %mm0
 ; CHECK-NEXT:    movd (%esi,%edi), %mm1
diff --git a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index b32afdc2214e0..b1b44f20bc44d 100644
--- a/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -28,11 +28,11 @@ define i16 @SQLDriversW(ptr %henv, i16 zeroext  %fDir, ptr %szDrvDesc, i16 signe
 ; CHECK-NEXT:  ## %bb.4: ## %bb37
 ; CHECK-NEXT:    movw $0, 40(%edi)
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    leal (,%ecx,4), %ecx
-; CHECK-NEXT:    leal (,%ebx,4), %edx
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    movzwl %bp, %eax
+; CHECK-NEXT:    leal (,%ecx,4), %ecx
 ; CHECK-NEXT:    movswl %cx, %ecx
+; CHECK-NEXT:    leal (,%ebx,4), %edx
 ; CHECK-NEXT:    movswl %dx, %edx
 ; CHECK-NEXT:    pushl $87
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
@@ -64,18 +64,18 @@ define i16 @SQLDriversW(ptr %henv, i16 zeroext  %fDir, ptr %szDrvDesc, i16 signe
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    jmp LBB0_2
 ; CHECK-NEXT:  LBB0_7: ## %bb150
-; CHECK-NEXT:    movswl %si, %eax
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    movswl %cx, %ecx
 ; CHECK-NEXT:    movswl %bx, %edx
-; CHECK-NEXT:    movzwl %bp, %esi
+; CHECK-NEXT:    movzwl %bp, %ebx
+; CHECK-NEXT:    movswl %si, %eax
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $1
diff --git a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll
index bf309f0156684..875a8d683cb80 100644
--- a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll
+++ b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll
@@ -10,10 +10,10 @@ define i16 @f(i64 %x, double %y) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT:    movsd %xmm1, atomic
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    movsd %xmm1, atomic2
+; CHECK-NEXT:    movsd %xmm0, atomic
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movsd %xmm0, atomic2
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movsd %xmm0, anything
 ; CHECK-NEXT:    movl ioport, %ecx
 ; CHECK-NEXT:    movl ioport, %eax
diff --git a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-3.ll b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
index 79ac6c9b0919c..fd86c8c300445 100644
--- a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
+++ b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
@@ -7,9 +7,9 @@ target triple = "i386-apple-darwin9.5"
 define i32 @test(ptr %a, ptr %L, ptr %P) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl %ecx, %eax
 ; CHECK-NEXT:    addl $-2, %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll b/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll
index 54f85b8f73817..8acda559cd852 100644
--- a/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll
+++ b/llvm/test/CodeGen/X86/2008-12-23-crazy-address.ll
@@ -29,18 +29,16 @@ entry:
 define void @bar(i32 %i) nounwind {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $40, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll frob at PLT
 ; CHECK-NEXT:    addl $4, %esp
-; CHECK-NEXT:    leal X(%esp,%esi,4), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    leal X(%esp,%eax,4), %eax
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll borf at PLT
 ; CHECK-NEXT:    addl $44, %esp
-; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
 entry:
 	%Y = alloca [10 x i32]
diff --git a/llvm/test/CodeGen/X86/2009-01-31-BigShift.ll b/llvm/test/CodeGen/X86/2009-01-31-BigShift.ll
index 3a151e898048a..8685ac17aa671 100644
--- a/llvm/test/CodeGen/X86/2009-01-31-BigShift.ll
+++ b/llvm/test/CodeGen/X86/2009-01-31-BigShift.ll
@@ -5,17 +5,32 @@
 define void @x(i288 %i) nounwind {
 ; CHECK-LABEL: x:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    calll add at PLT
 ; CHECK-NEXT:    addl $36, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 	call void @add(i288 %i)
 	ret void
diff --git a/llvm/test/CodeGen/X86/2009-01-31-BigShift2.ll b/llvm/test/CodeGen/X86/2009-01-31-BigShift2.ll
index 38240e8c1e2bd..5471a4ed96167 100644
--- a/llvm/test/CodeGen/X86/2009-01-31-BigShift2.ll
+++ b/llvm/test/CodeGen/X86/2009-01-31-BigShift2.ll
@@ -6,11 +6,11 @@ define void @test(ptr %P, ptr %Q) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 56(%ecx), %edx
-; CHECK-NEXT:    movl 60(%ecx), %ecx
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    movl %edx, (%eax)
+; CHECK-NEXT:    movl 56(%eax), %ecx
+; CHECK-NEXT:    movl 60(%eax), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %eax, 4(%edx)
+; CHECK-NEXT:    movl %ecx, (%edx)
 ; CHECK-NEXT:    retl
 	%A = load <8 x double>, ptr %P		; <<8 x double>> [#uses=1]
 	%B = bitcast <8 x double> %A to i512		; <i512> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
index 8fc701c8cf694..30fa8d32ecbc5 100644
--- a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
+++ b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
@@ -9,10 +9,10 @@ define void @cpuid(ptr %data) nounwind {
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl 32(%esi), %edx
+; CHECK-NEXT:    movl 28(%esi), %ecx
 ; CHECK-NEXT:    movl 20(%esi), %eax
 ; CHECK-NEXT:    movl 24(%esi), %ebx
-; CHECK-NEXT:    movl 28(%esi), %ecx
-; CHECK-NEXT:    movl 32(%esi), %edx
 ; CHECK-NEXT:    ## InlineAsm Start
 ; CHECK-NEXT:    cpuid
 ; CHECK-NEXT:    ## InlineAsm End
diff --git a/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll b/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll
index 695a2d0cd806e..caf7ed47aeb8c 100644
--- a/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll
+++ b/llvm/test/CodeGen/X86/2009-06-05-VariableIndexInsert.ll
@@ -9,10 +9,10 @@ define <2 x i64> @_mm_insert_epi16(<2 x i64> %a, i32 %b, i32 %imm) nounwind read
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movzwl 8(%ebp), %ecx
 ; X86-NEXT:    andl $7, %eax
-; X86-NEXT:    movaps %xmm0, (%esp)
+; X86-NEXT:    movzwl 8(%ebp), %ecx
 ; X86-NEXT:    movw %cx, (%esp,%eax,2)
 ; X86-NEXT:    movaps (%esp), %xmm0
 ; X86-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll
index 2504a6aa8193a..1a624444ae580 100644
--- a/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/llvm/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -8,9 +8,9 @@ define float @test(ptr %A) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps (%eax), %xmm0
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; X86-NEXT:    xorps %xmm1, %xmm1
 ; X86-NEXT:    movaps %xmm1, (%eax)
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index 5a4cbac57eeeb..3906a669f9314 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -69,10 +69,10 @@ define void @full_test() {
 ; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 64
 ; X86-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-NEXT:    cvttps2dq %xmm2, %xmm0
-; X86-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    cmpltps %xmm2, %xmm0
+; X86-NEXT:    cvttps2dq %xmm2, %xmm1
+; X86-NEXT:    cvtdq2ps %xmm1, %xmm1
 ; X86-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,u,u]
 ; X86-NEXT:    addps %xmm1, %xmm3
 ; X86-NEXT:    movaps %xmm1, %xmm4
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index f2b4c49b1dbcd..94ef86d1c8c02 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -93,8 +93,8 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    ## Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    ## Parent Loop BB0_13 Depth=2
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl $4, %edx
+; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    decl %ebx
 ; CHECK-NEXT:    jne LBB0_16
 ; CHECK-NEXT:  LBB0_17: ## %bb57
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 65273870c3dfb..283fb6e1cca55 100644
--- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -6,27 +6,27 @@ define void @endless_loop() {
 ; AVX1-LABEL: endless_loop:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vmovaps (%eax), %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX1-NEXT:    vmovaps %ymm0, (%eax)
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    vmovaps %ymm1, (%eax)
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX1-NEXT:    vmovaps %ymm0, (%eax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: endless_loop:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vbroadcastss (%eax), %xmm0
-; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm1
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX2-NEXT:    vmovaps %ymm0, (%eax)
+; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX2-NEXT:    vmovaps %ymm1, (%eax)
+; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    vmovaps %ymm0, (%eax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
index 6db17251cd599..47d01203d55e1 100644
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -28,8 +28,8 @@ entry:
 define void @store_64(ptr %ptr) {
 ; X86-LABEL: store_64:
 ; X86:       # %bb.0: # %BB
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll
index 2b692bff0461e..f14209c1a642c 100644
--- a/llvm/test/CodeGen/X86/3addr-16bit.ll
+++ b/llvm/test/CodeGen/X86/3addr-16bit.ll
@@ -29,9 +29,8 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    incl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal 1(%ecx), %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB0_2
 ; X86-NEXT:  ## %bb.1: ## %bb
@@ -82,9 +81,8 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    decl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal -1(%ecx), %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB1_2
 ; X86-NEXT:  ## %bb.1: ## %bb
@@ -137,9 +135,8 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl $2, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal 2(%ecx), %eax
 ; X86-NEXT:    cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    jne LBB2_2
 ; X86-NEXT:  ## %bb.1: ## %bb
@@ -191,9 +188,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    leal (%edx,%ecx), %eax
 ; X86-NEXT:    cmpw %cx, %dx
 ; X86-NEXT:    jne LBB3_2
 ; X86-NEXT:  ## %bb.1: ## %bb
diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
index 4c92adb25d0bd..77443ea16b46a 100644
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -24,10 +24,10 @@ define i64 @t0(i64 %val, i64 %shamt) nounwind {
 ; X86-NOBMI2-LABEL: t0:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $32, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -42,10 +42,10 @@ define i64 @t0(i64 %val, i64 %shamt) nounwind {
 ;
 ; X86-BMI2-LABEL: t0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $32, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -80,10 +80,10 @@ define i64 @n1(i64 %val, i64 %shamt) nounwind {
 ; X86-NOBMI2-LABEL: n1:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $33, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -98,10 +98,10 @@ define i64 @n1(i64 %val, i64 %shamt) nounwind {
 ;
 ; X86-BMI2-LABEL: n1:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $33, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -134,10 +134,10 @@ define i64 @n2(i64 %val, i64 %shamt) nounwind {
 ; X86-NOBMI2-LABEL: n2:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $31, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -152,10 +152,10 @@ define i64 @n2(i64 %val, i64 %shamt) nounwind {
 ;
 ; X86-BMI2-LABEL: n2:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $31, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -189,10 +189,10 @@ define i64 @t3(i64 %val, i64 %shamt) nounwind {
 ; X86-NOBMI2-LABEL: t3:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $64, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -207,10 +207,10 @@ define i64 @t3(i64 %val, i64 %shamt) nounwind {
 ;
 ; X86-BMI2-LABEL: t3:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -245,10 +245,10 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind {
 ; X86-NOBMI2-LABEL: t4:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $96, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -263,10 +263,10 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind {
 ;
 ; X86-BMI2-LABEL: t4:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $96, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -302,21 +302,19 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ;
 ; X86-NOBMI2-LABEL: t5_cse:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %ebx
-; X86-NOBMI2-NEXT:    pushl %edi
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI2-NEXT:    movl %eax, %ebx
-; X86-NOBMI2-NEXT:    addl $32, %ebx
-; X86-NOBMI2-NEXT:    adcl $0, %edi
-; X86-NOBMI2-NEXT:    movl %ebx, (%ecx)
-; X86-NOBMI2-NEXT:    movl %edi, 4(%ecx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI2-NEXT:    movl %eax, %esi
+; X86-NOBMI2-NEXT:    addl $32, %esi
+; X86-NOBMI2-NEXT:    movl %esi, (%ecx)
+; X86-NOBMI2-NEXT:    adcl $0, %edx
+; X86-NOBMI2-NEXT:    movl %edx, 4(%ecx)
 ; X86-NOBMI2-NEXT:    movb $32, %cl
 ; X86-NOBMI2-NEXT:    subb %al, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -327,27 +325,23 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB5_2:
 ; X86-NOBMI2-NEXT:    popl %esi
-; X86-NOBMI2-NEXT:    popl %edi
-; X86-NOBMI2-NEXT:    popl %ebx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: t5_cse:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI2-NEXT:    addl $32, %edi
-; X86-BMI2-NEXT:    adcl $0, %esi
-; X86-BMI2-NEXT:    movl %edi, (%ecx)
-; X86-BMI2-NEXT:    movl %esi, 4(%ecx)
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:    addl $32, %esi
+; X86-BMI2-NEXT:    movl %esi, (%ecx)
+; X86-BMI2-NEXT:    adcl $0, %edx
+; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
 ; X86-BMI2-NEXT:    movb $32, %cl
-; X86-BMI2-NEXT:    subb %bl, %cl
+; X86-BMI2-NEXT:    subb %al, %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -357,8 +351,6 @@ define i64 @t5_cse(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB5_2:
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
   %incshamt = add i64 %shamt, 32
   store i64 %incshamt, ptr %dst
@@ -388,17 +380,16 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ;
 ; X86-NOBMI2-LABEL: t6_cse2:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %edi
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    xorl %edi, %edi
 ; X86-NOBMI2-NEXT:    movl $32, %ecx
 ; X86-NOBMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI2-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI2-NEXT:    movl %ecx, (%eax)
-; X86-NOBMI2-NEXT:    movl %edi, 4(%eax)
+; X86-NOBMI2-NEXT:    movl %ecx, (%edx)
+; X86-NOBMI2-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movl %eax, 4(%edx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -409,22 +400,19 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB6_2:
 ; X86-NOBMI2-NEXT:    popl %esi
-; X86-NOBMI2-NEXT:    popl %edi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: t6_cse2:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %edi
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:    movl $32, %ecx
 ; X86-BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-BMI2-NEXT:    movl %ecx, (%esi)
-; X86-BMI2-NEXT:    movl %edi, 4(%esi)
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
+; X86-BMI2-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, 4(%edx)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -433,8 +421,6 @@ define i64 @t6_cse2(i64 %val, i64 %shamt, ptr%dst) nounwind {
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB6_2:
-; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
   %negshamt = sub i64 32, %shamt
   store i64 %negshamt, ptr %dst
diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index 1a8d33f5b3480..7d1d74aef1f3c 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -11,10 +11,10 @@
 define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: t0:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addl %eax, %ecx
+; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB0_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
@@ -29,17 +29,17 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I386-CMOV-NEXT:    addl %eax, %ecx
 ; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    cmpb %al, %cl
-; I386-CMOV-NEXT:    cmovgl %ecx, %eax
+; I386-CMOV-NEXT:    cmpb %cl, %al
+; I386-CMOV-NEXT:    cmovlel %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: t0:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addl %eax, %ecx
+; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB0_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
@@ -54,8 +54,8 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I686-CMOV-NEXT:    addl %eax, %ecx
 ; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    cmpb %al, %cl
-; I686-CMOV-NEXT:    cmovgl %ecx, %eax
+; I686-CMOV-NEXT:    cmpb %cl, %al
+; I686-CMOV-NEXT:    cmovlel %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
@@ -84,10 +84,10 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: neg_only_one_truncation:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addb %al, %cl
+; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB1_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
@@ -99,21 +99,21 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
 ; I386-CMOV-LABEL: neg_only_one_truncation:
 ; I386-CMOV:       # %bb.0:
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I386-CMOV-NEXT:    cmpb %al, %cl
-; I386-CMOV-NEXT:    movzbl %al, %eax
-; I386-CMOV-NEXT:    cmovgl %ecx, %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
+; I386-CMOV-NEXT:    movzbl %cl, %edx
+; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpb %cl, %al
+; I386-CMOV-NEXT:    cmovlel %edx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: neg_only_one_truncation:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addb %al, %cl
+; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB1_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
@@ -125,12 +125,12 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
 ; I686-CMOV-LABEL: neg_only_one_truncation:
 ; I686-CMOV:       # %bb.0:
 ; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I686-CMOV-NEXT:    cmpb %al, %cl
-; I686-CMOV-NEXT:    movzbl %al, %eax
-; I686-CMOV-NEXT:    cmovgl %ecx, %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
+; I686-CMOV-NEXT:    movzbl %cl, %edx
+; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    cmpb %cl, %al
+; I686-CMOV-NEXT:    cmovlel %edx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
@@ -174,11 +174,11 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
 ; I386-CMOV-LABEL: neg_type_mismatch:
 ; I386-CMOV:       # %bb.0:
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
-; I386-CMOV-NEXT:    cmpb %al, %cl
-; I386-CMOV-NEXT:    cmovgl %ecx, %eax
+; I386-CMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addw %ax, %cx
+; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpb %cl, %al
+; I386-CMOV-NEXT:    cmovlel %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
@@ -199,11 +199,11 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
 ; I686-CMOV-LABEL: neg_type_mismatch:
 ; I686-CMOV:       # %bb.0:
 ; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
-; I686-CMOV-NEXT:    cmpb %al, %cl
-; I686-CMOV-NEXT:    cmovgl %ecx, %eax
+; I686-CMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addw %ax, %cx
+; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    cmpb %cl, %al
+; I686-CMOV-NEXT:    cmovlel %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
@@ -233,9 +233,9 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
 define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: negative_CopyFromReg:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB3_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
@@ -255,9 +255,9 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ;
 ; I686-NOCMOV-LABEL: negative_CopyFromReg:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB3_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
index 261e8f873b2fe..9a0a959599d12 100644
--- a/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
+++ b/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
@@ -50,16 +50,14 @@ define cc 11 i32 @caller(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
 define x86_regcallcc {i32, i32, i32} @test_callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
 ; CHECK-LABEL: test_callee:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leal (,%esi,8), %ecx
-; CHECK-NEXT:    subl %esi, %ecx
 ; CHECK-NEXT:    movl $5, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
-; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    leal (,%esi,8), %eax
+; CHECK-NEXT:    subl %esi, %eax
 ; CHECK-NEXT:    leal (,%edi,8), %edx
 ; CHECK-NEXT:    subl %edi, %edx
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    retl
   %b1 = mul i32 7, %e0
   %b2 = udiv i32 5, %e0
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index 0103d2bf3cc2c..bc29b537bdfc9 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -20,8 +20,8 @@ define void @merge_const_store(i32 %count, ptr nocapture %p) nounwind uwtable no
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_2: # %.lr.ph
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-NEXT:    movl $134678021, 4(%ecx) # imm = 0x8070605
+; X86-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-NEXT:    addl $8, %ecx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    jne .LBB0_2
@@ -84,14 +84,14 @@ define void @merge_const_store_no_vec(i32 %count, ptr nocapture %p) noimplicitfl
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB1_2: # %.lr.ph
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl $0, (%ecx)
-; X86-NEXT:    movl $0, 4(%ecx)
-; X86-NEXT:    movl $0, 8(%ecx)
-; X86-NEXT:    movl $0, 12(%ecx)
-; X86-NEXT:    movl $0, 16(%ecx)
-; X86-NEXT:    movl $0, 20(%ecx)
-; X86-NEXT:    movl $0, 24(%ecx)
 ; X86-NEXT:    movl $0, 28(%ecx)
+; X86-NEXT:    movl $0, 24(%ecx)
+; X86-NEXT:    movl $0, 20(%ecx)
+; X86-NEXT:    movl $0, 16(%ecx)
+; X86-NEXT:    movl $0, 12(%ecx)
+; X86-NEXT:    movl $0, 8(%ecx)
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $0, (%ecx)
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    jne .LBB1_2
@@ -222,10 +222,10 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, ptr nocapture %p) nounwind
 ; X86-BWON-NEXT:    .p2align 4
 ; X86-BWON-NEXT:  .LBB3_2: # %.lr.ph
 ; X86-BWON-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-BWON-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-BWON-NEXT:    movb %dl, 4(%ecx)
-; X86-BWON-NEXT:    movw $1798, 5(%ecx) # imm = 0x706
 ; X86-BWON-NEXT:    movb $8, 7(%ecx)
+; X86-BWON-NEXT:    movw $1798, 5(%ecx) # imm = 0x706
+; X86-BWON-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-BWON-NEXT:    addl $8, %ecx
 ; X86-BWON-NEXT:    decl %eax
 ; X86-BWON-NEXT:    jne .LBB3_2
@@ -243,10 +243,10 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, ptr nocapture %p) nounwind
 ; X86-BWOFF-NEXT:    .p2align 4
 ; X86-BWOFF-NEXT:  .LBB3_2: # %.lr.ph
 ; X86-BWOFF-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-BWOFF-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-BWOFF-NEXT:    movb %dl, 4(%ecx)
-; X86-BWOFF-NEXT:    movw $1798, 5(%ecx) # imm = 0x706
 ; X86-BWOFF-NEXT:    movb $8, 7(%ecx)
+; X86-BWOFF-NEXT:    movw $1798, 5(%ecx) # imm = 0x706
+; X86-BWOFF-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-BWOFF-NEXT:    addl $8, %ecx
 ; X86-BWOFF-NEXT:    decl %eax
 ; X86-BWOFF-NEXT:    jne .LBB3_2
@@ -530,8 +530,8 @@ define void @merge_loads_integer(i32 %count, ptr noalias nocapture %q, ptr noali
 ; X86-NEXT:  .LBB6_2: # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl (%edx), %esi
 ; X86-NEXT:    movl 4(%edx), %edi
-; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %edi, 4(%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    jne .LBB6_2
@@ -872,9 +872,9 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(ptr %a, ptr %b, ptr %c, i6
 ; X86-BWON-NEXT:    movl %eax, %edx
 ; X86-BWON-NEXT:    orl $1, %edx
 ; X86-BWON-NEXT:    movb %cl, (%ebx,%edx)
-; X86-BWON-NEXT:    incl %edi
 ; X86-BWON-NEXT:    addl $2, %eax
 ; X86-BWON-NEXT:    adcl $0, %ebp
+; X86-BWON-NEXT:    incl %edi
 ; X86-BWON-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-BWON-NEXT:    movl %ebp, %ecx
 ; X86-BWON-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
@@ -918,9 +918,9 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(ptr %a, ptr %b, ptr %c, i6
 ; X86-BWOFF-NEXT:    movl %eax, %edx
 ; X86-BWOFF-NEXT:    orl $1, %edx
 ; X86-BWOFF-NEXT:    movb %cl, (%ebx,%edx)
-; X86-BWOFF-NEXT:    incl %edi
 ; X86-BWOFF-NEXT:    addl $2, %eax
 ; X86-BWOFF-NEXT:    adcl $0, %ebp
+; X86-BWOFF-NEXT:    incl %edi
 ; X86-BWOFF-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-BWOFF-NEXT:    movl %ebp, %ecx
 ; X86-BWOFF-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
@@ -1145,8 +1145,8 @@ define void @loadStoreBaseIndexOffsetSextNoSex(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; X86-BWON-NEXT:    incb %bl
 ; X86-BWON-NEXT:    movsbl %bl, %ebx
 ; X86-BWON-NEXT:    movb (%edx,%ebx), %ch
-; X86-BWON-NEXT:    movb %cl, (%esi,%eax,2)
 ; X86-BWON-NEXT:    movb %ch, 1(%esi,%eax,2)
+; X86-BWON-NEXT:    movb %cl, (%esi,%eax,2)
 ; X86-BWON-NEXT:    incl %eax
 ; X86-BWON-NEXT:    cmpl %eax, %ebp
 ; X86-BWON-NEXT:    jne .LBB12_1
@@ -1187,8 +1187,8 @@ define void @loadStoreBaseIndexOffsetSextNoSex(ptr %a, ptr %b, ptr %c, i32 %n) {
 ; X86-BWOFF-NEXT:    incb %bl
 ; X86-BWOFF-NEXT:    movsbl %bl, %ebx
 ; X86-BWOFF-NEXT:    movb (%edx,%ebx), %ch
-; X86-BWOFF-NEXT:    movb %cl, (%esi,%eax,2)
 ; X86-BWOFF-NEXT:    movb %ch, 1(%esi,%eax,2)
+; X86-BWOFF-NEXT:    movb %cl, (%esi,%eax,2)
 ; X86-BWOFF-NEXT:    incl %eax
 ; X86-BWOFF-NEXT:    cmpl %eax, %ebp
 ; X86-BWOFF-NEXT:    jne .LBB12_1
@@ -1314,8 +1314,8 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, ptr %ptr
 ; X86-LABEL: merge_vec_extract_stores:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vmovups %ymm0, 48(%eax)
 ; X86-NEXT:    vmovups %ymm1, 80(%eax)
+; X86-NEXT:    vmovups %ymm0, 48(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -1346,8 +1346,8 @@ define void @merge_vec_stores_from_loads(ptr %v, ptr %ptr) {
 ; X86-LABEL: merge_vec_stores_from_loads:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovups (%ecx), %ymm0
+; X86-NEXT:    vmovups (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -1373,8 +1373,8 @@ define void @merge_vec_stores_from_loads(ptr %v, ptr %ptr) {
 define void @merge_vec_stores_of_zero(ptr %ptr) {
 ; X86-LABEL: merge_vec_stores_of_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups %ymm0, 48(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -1395,10 +1395,10 @@ define void @merge_vec_stores_of_zero(ptr %ptr) {
 define void @merge_vec_stores_of_constant_splat(ptr %ptr) {
 ; X86-LABEL: merge_vec_stores_of_constant_splat:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vmovaps %xmm0, 48(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, 64(%eax)
+; X86-NEXT:    vmovaps %xmm0, 48(%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: merge_vec_stores_of_constant_splat:
@@ -1417,11 +1417,11 @@ define void @merge_vec_stores_of_constant_splat(ptr %ptr) {
 define void @merge_vec_stores_of_constants(ptr %ptr) {
 ; X86-LABEL: merge_vec_stores_of_constants:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovaps {{.*#+}} xmm0 = [0,265,26,0]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps %xmm0, 64(%eax)
 ; X86-NEXT:    vmovaps {{.*#+}} xmm0 = [25,51,45,0]
 ; X86-NEXT:    vmovaps %xmm0, 48(%eax)
-; X86-NEXT:    vmovaps {{.*#+}} xmm0 = [0,265,26,0]
-; X86-NEXT:    vmovaps %xmm0, 64(%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: merge_vec_stores_of_constants:
@@ -1441,8 +1441,8 @@ define void @merge_vec_stores_of_constants(ptr %ptr) {
 define void @merge_vec_stores_of_constants_with_undefs(ptr %ptr) {
 ; X86-LABEL: merge_vec_stores_of_constants_with_undefs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups %ymm0, 48(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -1500,8 +1500,8 @@ define void @almost_consecutive_stores(ptr %p) {
 ; X86-LABEL: almost_consecutive_stores:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb $0, (%eax)
 ; X86-NEXT:    movb $1, 42(%eax)
+; X86-NEXT:    movb $0, (%eax)
 ; X86-NEXT:    movw $770, 2(%eax) # imm = 0x302
 ; X86-NEXT:    retl
 ;
@@ -1564,8 +1564,8 @@ define void @merge_const_store_heterogeneous(i32 %count, ptr nocapture %p) nounw
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB23_2: # %.lr.ph
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-NEXT:    movl $134678021, 4(%ecx) # imm = 0x8070605
+; X86-NEXT:    movl $67305985, (%ecx) # imm = 0x4030201
 ; X86-NEXT:    addl $24, %ecx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    jne .LBB23_2
@@ -1615,11 +1615,11 @@ define void @merge_heterogeneous(ptr nocapture %p, ptr nocapture %q) {
 ; X86-LABEL: merge_heterogeneous:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: merge_heterogeneous:
@@ -1676,9 +1676,9 @@ entry:
 define i32 @merge_store_alias(ptr %buff, ptr %other) {
 ; X86-LABEL: merge_store_alias:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl $0, 4(%ecx)
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/SwitchLowering.ll b/llvm/test/CodeGen/X86/SwitchLowering.ll
index 921b4e3d24d11..0a4d2f8bdc2ee 100644
--- a/llvm/test/CodeGen/X86/SwitchLowering.ll
+++ b/llvm/test/CodeGen/X86/SwitchLowering.ll
@@ -5,37 +5,32 @@
 define ptr @FindChar(ptr %CurPtr) {
 ; CHECK-LABEL: FindChar:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-NEXT:    .cfi_offset %esi, -12
-; CHECK-NEXT:    .cfi_offset %edi, -8
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %bb
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movzbl (%esi,%edi), %eax
-; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    cmpl $120, %eax
+; CHECK-NEXT:    movzbl (%esi,%eax), %ecx
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    cmpl $120, %ecx
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %bb
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  .LBB0_3: # %bb7
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movzbl %cl, %ecx
+; CHECK-NEXT:    addl %eax, %esi
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
 ; CHECK-NEXT:    calll foo at PLT
 ; CHECK-NEXT:    addl $4, %esp
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
-; CHECK-NEXT:    addl %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/WidenArith.ll b/llvm/test/CodeGen/X86/WidenArith.ll
index 194f614d084bc..56d9015dce182 100644
--- a/llvm/test/CodeGen/X86/WidenArith.ll
+++ b/llvm/test/CodeGen/X86/WidenArith.ll
@@ -8,9 +8,9 @@ define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
 ; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm2
 ; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm1
 ; X86-NEXT:    vsubps %ymm2, %ymm1, %ymm3
+; X86-NEXT:    vcmpltps %ymm3, %ymm2, %ymm2
 ; X86-NEXT:    vcmpltps %ymm1, %ymm0, %ymm0
-; X86-NEXT:    vcmpltps %ymm3, %ymm2, %ymm1
-; X86-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; X86-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index d9064c684cb20..ee670ae657a23 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -373,42 +373,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 52(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    subl 24(%ebp), %ecx
-; X86-NEXT:    sbbl 28(%ebp), %eax
-; X86-NEXT:    sbbl 32(%ebp), %edx
-; X86-NEXT:    sbbl 36(%ebp), %esi
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    subl 24(%ebp), %edx
+; X86-NEXT:    sbbl 28(%ebp), %ecx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovll %eax, %edi
+; X86-NEXT:    cmovll %ebx, %esi
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 8(%ebp), %edx
-; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %ebx, 4(%edx)
-; X86-NEXT:    movl %eax, 8(%edx)
-; X86-NEXT:    movl %edi, 12(%edx)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -449,42 +448,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl 52(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    subl 24(%ebp), %ecx
-; X86-NEXT:    sbbl 28(%ebp), %eax
-; X86-NEXT:    sbbl 32(%ebp), %edx
-; X86-NEXT:    sbbl 36(%ebp), %esi
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    subl 24(%ebp), %edx
+; X86-NEXT:    sbbl 28(%ebp), %ecx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovll %eax, %edi
+; X86-NEXT:    cmovll %ebx, %esi
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 8(%ebp), %edx
-; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %ebx, 4(%edx)
-; X86-NEXT:    movl %eax, 8(%edx)
-; X86-NEXT:    movl %edi, 12(%edx)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -608,24 +606,24 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmovll %edi, %eax
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
-; X86-NEXT:    cmovll %ebx, %ecx
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    cmovll %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    cmovll %ecx, %ebx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    cmovll %esi, %edx
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    subl %ebx, %eax
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -655,52 +653,55 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    sbbl 44(%ebp), %eax
-; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmpl %ebx, %edx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl 32(%ebp), %eax
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    cmovll 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    cmovll 28(%ebp), %eax
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    cmovll %edi, %ecx
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    sbbl 28(%ebp), %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sbbl 32(%ebp), %edi
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl 36(%ebp), %edi
-; X86-NEXT:    cmovll 36(%ebp), %ebx
-; X86-NEXT:    cmovll 32(%ebp), %edx
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    cmovll 28(%ebp), %edi
-; X86-NEXT:    cmovll 24(%ebp), %esi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    movl 8(%ebp), %edx
-; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %eax, 4(%edx)
-; X86-NEXT:    movl %edi, 8(%edx)
-; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmovll %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll %ebx, %eax
+; X86-NEXT:    cmpl %edx, %ebx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    cmovll %ecx, %edi
+; X86-NEXT:    cmovll 32(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %ebx
+; X86-NEXT:    cmovll 28(%ebp), %ebx
+; X86-NEXT:    cmovll 24(%ebp), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sbbl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -877,25 +878,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 48(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 52(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 40(%ebp), %ecx
 ; X86-NEXT:    sbbl 44(%ebp), %edx
-; X86-NEXT:    sbbl 48(%ebp), %esi
-; X86-NEXT:    sbbl 52(%ebp), %ebx
-; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 48(%ebp), %ebx
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    cmovgel %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovgel %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovgel %edi, %esi
-; X86-NEXT:    cmovgel %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1142,32 +1144,32 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %edx
-; X86-NEXT:    sbbl 48(%ebp), %ecx
-; X86-NEXT:    sbbl 52(%ebp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %eax
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    subl %edi, %ebx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %eax
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1203,32 +1205,32 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %edx
-; X86-NEXT:    sbbl 48(%ebp), %ecx
-; X86-NEXT:    sbbl 52(%ebp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %eax
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    subl %edi, %ebx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %eax
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll
index a1a4ba81ae493..80a7eec1aa192 100644
--- a/llvm/test/CodeGen/X86/abds.ll
+++ b/llvm/test/CodeGen/X86/abds.ll
@@ -358,25 +358,26 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl 28(%ebp), %edx
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    sbbl 36(%ebp), %ebx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl 36(%ebp), %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -423,25 +424,26 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl 28(%ebp), %edx
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    sbbl 36(%ebp), %ebx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl 36(%ebp), %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -608,25 +610,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl 28(%ebp), %edx
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    sbbl 36(%ebp), %ebx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl 36(%ebp), %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -795,25 +798,26 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl 28(%ebp), %edx
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    sbbl 36(%ebp), %ebx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl 36(%ebp), %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1045,9 +1049,11 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl 24(%ebp), %edi
@@ -1056,24 +1062,25 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    sbbl 44(%ebp), %esi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    subl %ebx, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -1100,9 +1107,11 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl 24(%ebp), %edi
@@ -1111,24 +1120,25 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    sbbl 44(%ebp), %esi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    subl %ebx, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -1319,25 +1329,26 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    subl 24(%ebp), %ecx
 ; X86-NEXT:    sbbl 28(%ebp), %edx
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    sbbl 36(%ebp), %ebx
-; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl 36(%ebp), %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %eax, 12(%edi)
+; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%edi)
 ; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %edi, %esi
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index cebb42751dfab..98bf21110ac92 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -9,8 +9,8 @@
 define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_ext_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorb %al, %cl
@@ -72,8 +72,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_ext_i8_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorb %al, %cl
@@ -102,8 +102,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_ext_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorl %eax, %ecx
@@ -163,8 +163,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_ext_i16_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorl %eax, %ecx
@@ -354,36 +354,36 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
 ; X86-NEXT:    sbbl 44(%ebp), %edi
-; X86-NEXT:    sbbl 48(%ebp), %esi
-; X86-NEXT:    sbbl 52(%ebp), %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %eax
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    subl %ebx, %ecx
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -426,36 +426,36 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
 ; X86-NEXT:    sbbl 44(%ebp), %edi
-; X86-NEXT:    sbbl 48(%ebp), %esi
-; X86-NEXT:    sbbl 52(%ebp), %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %eax
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    subl %ebx, %ecx
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -495,8 +495,8 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_minmax_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorb %al, %cl
@@ -522,8 +522,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_minmax_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorl %eax, %ecx
@@ -577,24 +577,24 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    cmovbl %edi, %eax
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
-; X86-NEXT:    cmovbl %ebx, %ecx
-; X86-NEXT:    cmovbl %edi, %esi
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    cmovbl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    cmovbl %ecx, %ebx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    cmovbl %esi, %edx
+; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    subl %ebx, %eax
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -624,52 +624,55 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    sbbl 44(%ebp), %eax
-; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmpl %ebx, %edx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl 32(%ebp), %eax
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    cmovbl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    cmovbl 28(%ebp), %eax
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    cmovbl %edi, %ecx
-; X86-NEXT:    cmpl %edi, %esi
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    sbbl 28(%ebp), %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sbbl 32(%ebp), %edi
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl 36(%ebp), %edi
-; X86-NEXT:    cmovbl 36(%ebp), %ebx
-; X86-NEXT:    cmovbl 32(%ebp), %edx
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    cmovbl 28(%ebp), %edi
-; X86-NEXT:    cmovbl 24(%ebp), %esi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    movl 8(%ebp), %edx
-; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %eax, 4(%edx)
-; X86-NEXT:    movl %edi, 8(%edx)
-; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovbl %ebx, %eax
+; X86-NEXT:    cmpl %edx, %ebx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    cmovbl %ecx, %edi
+; X86-NEXT:    cmovbl 32(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %ebx
+; X86-NEXT:    cmovbl 28(%ebp), %ebx
+; X86-NEXT:    cmovbl 24(%ebp), %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sbbl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -708,8 +711,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_cmp_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorb %al, %cl
@@ -736,8 +739,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_cmp_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    xorl %eax, %ecx
@@ -830,36 +833,36 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
 ; X86-NEXT:    sbbl 44(%ebp), %edi
-; X86-NEXT:    sbbl 48(%ebp), %esi
-; X86-NEXT:    sbbl 52(%ebp), %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %eax
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    subl %ebx, %ecx
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index b8bc3649773f2..ae5536782e4c1 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -9,8 +9,8 @@
 define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_ext_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorb %cl, %al
@@ -68,8 +68,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_ext_i8_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorb %cl, %al
@@ -97,8 +97,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_ext_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -156,8 +156,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_ext_i16_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -327,35 +327,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
+; X86-NEXT:    sbbl 44(%ebp), %edi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -385,35 +388,38 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
+; X86-NEXT:    sbbl 44(%ebp), %edi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -445,8 +451,8 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_minmax_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorb %cl, %al
@@ -472,8 +478,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_minmax_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -554,35 +560,38 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
+; X86-NEXT:    sbbl 44(%ebp), %edi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -612,8 +621,8 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_cmp_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorb %cl, %al
@@ -640,8 +649,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_cmp_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -725,35 +734,38 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
+; X86-NEXT:    sbbl 44(%ebp), %edi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -784,8 +796,8 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 ; X86-LABEL: abd_select_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorb %cl, %al
@@ -812,8 +824,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
 ; X86-LABEL: abd_select_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -897,35 +909,38 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl 40(%ebp), %edi
-; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl 40(%ebp), %ebx
+; X86-NEXT:    sbbl 44(%ebp), %edi
 ; X86-NEXT:    sbbl 48(%ebp), %edx
 ; X86-NEXT:    sbbl 52(%ebp), %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index e252d5953e60e..4b8cb3ea95908 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -146,31 +146,34 @@ define i128 @test_i128(i128 %a) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call i128 @llvm.abs.i128(i128 %a, i1 false)
@@ -213,10 +216,10 @@ define <2 x i32> @test_v2i32(<2 x i32> %a) nounwind {
 ; X86-LABEL: test_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    cmovsl %edx, %eax
+; X86-NEXT:    cmovsl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    negl %edx
 ; X86-NEXT:    cmovsl %ecx, %edx
@@ -242,15 +245,15 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) nounwind {
 ; X86-LABEL: test_v3i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    cmovsl %edx, %eax
+; X86-NEXT:    cmovsl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    negl %edx
 ; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    cmovsl %esi, %ecx
@@ -276,33 +279,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) nounwind {
 ;
 ; X86-LABEL: test_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    cmovsl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    negl %edx
-; X86-NEXT:    cmovsl %ebx, %edx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    cmovsl %edi, %ebx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    cmovsl %esi, %edi
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    cmovsl %ecx, %esi
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    cmovsl %ecx, %edx
 ; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false)
   ret <4 x i32> %r
@@ -336,62 +333,47 @@ define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind {
 ;
 ; X86-LABEL: test_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    cmovsl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    cmovsl %esi, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    cmovsl %edi, %esi
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    cmovsl %ebp, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    cmovsl %ebx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    cmovsl %eax, %ebx
+; X86-NEXT:    cmovsl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    cmovsl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    cmovsl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, 28(%edx)
-; X86-NEXT:    movl %eax, 24(%edx)
-; X86-NEXT:    movl %ebx, 20(%edx)
-; X86-NEXT:    movl %ebp, 16(%edx)
-; X86-NEXT:    movl %edi, 12(%edx)
-; X86-NEXT:    movl %esi, 8(%edx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 4(%edx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    cmovsl %ecx, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false)
   ret <8 x i32> %r
@@ -412,62 +394,47 @@ define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind {
 ;
 ; X86-LABEL: test_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    negw %cx
-; X86-NEXT:    cmovsw %dx, %cx
-; X86-NEXT:    movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negw %cx
-; X86-NEXT:    cmovsw %si, %cx
-; X86-NEXT:    movw %cx, (%esp) # 2-byte Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    negw %si
-; X86-NEXT:    cmovsw %di, %si
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    negw %di
-; X86-NEXT:    cmovsw %bp, %di
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    negw %bp
-; X86-NEXT:    cmovsw %bx, %bp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    negw %bx
-; X86-NEXT:    cmovsw %ax, %bx
+; X86-NEXT:    cmovsw %ax, %cx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negw %ax
-; X86-NEXT:    cmovsw %cx, %ax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    negw %cx
-; X86-NEXT:    cmovsw %dx, %cx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movw %cx, 14(%edx)
-; X86-NEXT:    movw %ax, 12(%edx)
-; X86-NEXT:    movw %bx, 10(%edx)
-; X86-NEXT:    movw %bp, 8(%edx)
-; X86-NEXT:    movw %di, 6(%edx)
-; X86-NEXT:    movw %si, 4(%edx)
-; X86-NEXT:    movzwl (%esp), %eax # 2-byte Folded Reload
-; X86-NEXT:    movw %ax, 2(%edx)
-; X86-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; X86-NEXT:    movw %ax, (%edx)
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    addl $4, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 12(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 10(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 8(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 6(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 4(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, 2(%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negw %dx
+; X86-NEXT:    cmovsw %cx, %dx
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false)
   ret <8 x i16> %r
@@ -488,127 +455,103 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
 ;
 ; X86-LABEL: test_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb %cl, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %dl, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %ah, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %ah
-; X86-NEXT:    subb %al, %ah
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %ch, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %ch
-; X86-NEXT:    subb %al, %ch
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %dh, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %dh
-; X86-NEXT:    subb %al, %dh
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %bl
-; X86-NEXT:    subb %al, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb %bh, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %bh
-; X86-NEXT:    subb %al, %bh
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarb $7, %al
 ; X86-NEXT:    xorb %al, %cl
 ; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movb %bh, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %bh
-; X86-NEXT:    subb %al, %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %bl
-; X86-NEXT:    subb %al, %bl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb %dh, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %dh
-; X86-NEXT:    subb %al, %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb %ch, %al
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %ch
-; X86-NEXT:    subb %al, %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %ch, 13(%eax)
-; X86-NEXT:    movb %dh, 12(%eax)
-; X86-NEXT:    movb %bl, 11(%eax)
-; X86-NEXT:    movb %bh, 10(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    movb %cl, 14(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    movb %cl, 13(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    movb %cl, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    movb %cl, 11(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    movb %cl, 10(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarb $7, %dl
+; X86-NEXT:    xorb %dl, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false)
   ret <16 x i8> %r
@@ -696,16 +639,16 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    subl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
@@ -821,8 +764,8 @@ define i64 @test_minsigned_i64(i64 %a0, i64 %a1) nounwind {
 ; X86-LABEL: test_minsigned_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal -2147483648(%edx), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    jne .LBB20_1
 ; X86-NEXT:  # %bb.2: # %select.end
diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll
index 10e3a6bf6d533..9f4cd50bf56f5 100644
--- a/llvm/test/CodeGen/X86/add-and-not.ll
+++ b/llvm/test/CodeGen/X86/add-and-not.ll
@@ -131,20 +131,23 @@ define i8 @add_and_xor_extra_use(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: add_and_xor_extra_use:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    notb %bh
-; X86-NEXT:    movzbl %bh, %eax
-; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movb %bh, %al
+; X86-NEXT:    notb %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-NEXT:    andb %bl, %al
+; X86-NEXT:    movzbl %al, %esi
 ; X86-NEXT:    calll use@PLT
-; X86-NEXT:    andb %bl, %bh
-; X86-NEXT:    movzbl %bh, %eax
-; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %esi, (%esp)
+; X86-NEXT:    orb %bh, %bl
 ; X86-NEXT:    calll use@PLT
-; X86-NEXT:    orb {{[0-9]+}}(%esp), %bl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
@@ -199,10 +202,10 @@ define i64 @add_and_xor_const(i64 %x) {
 define i64 @add_and_xor_const_wrong_op(i64 %x, i64 %y) {
 ; X86-LABEL: add_and_xor_const_wrong_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    retl
@@ -332,9 +335,9 @@ define i64 @add_and_xor_const_zext_trunc(i64 %x) {
 define i64 @add_and_xor_const_zext_trunc_var(i64 %x, i64 %y) {
 ; X86-LABEL: add_and_xor_const_zext_trunc_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: add_and_xor_const_zext_trunc_var:
diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index 85c14d1e0ac04..1b61dbc0b274e 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -59,9 +59,9 @@ define i32 @test_i32_add_add_commute_idx(i32 %x, i32 %y, i32 %z) nounwind {
 define i32 @test_i32_add_add_idx0(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: test_i32_add_add_idx0:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
@@ -109,24 +109,24 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 48(%ebp), %ecx
-; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl 40(%ebp), %esi
 ; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    addl 24(%ebp), %esi
 ; X86-NEXT:    adcl 28(%ebp), %edi
-; X86-NEXT:    adcl 32(%ebp), %ecx
-; X86-NEXT:    adcl 36(%ebp), %edx
+; X86-NEXT:    adcl 32(%ebp), %edx
+; X86-NEXT:    adcl 36(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    btl $5, 64(%ebp)
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -153,10 +153,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 define i32 @test_i32_add_sub_idx(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: test_i32_add_sub_idx:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $31, %eax
+; X86-NEXT:    shrl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -246,11 +246,11 @@ define i32 @test_i32_sub_add_commute_idx(i32 %x, i32 %y, i32 %z) nounwind {
 define i32 @test_i32_sub_add_sext_idx(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: test_i32_sub_add_sext_idx:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $25, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -392,27 +392,25 @@ define i32 @test_i32_add_add_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwin
 define i64 @test_i64_add_add_var(i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
 ; X86-LABEL: test_i64_add_add_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB15_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:  .LBB15_2:
 ; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_i64_add_add_var:
@@ -431,10 +429,10 @@ define i64 @test_i64_add_add_var(i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
 define i32 @test_i32_add_sub_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 ; X86-LABEL: test_i32_add_sub_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    retl
@@ -456,10 +454,10 @@ define i32 @test_i32_add_sub_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 define i32 @test_i32_add_sub_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 ; X86-LABEL: test_i32_add_sub_commute_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    retl
@@ -481,10 +479,10 @@ define i32 @test_i32_add_sub_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwin
 define i32 @test_i32_sub_add_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 ; X86-LABEL: test_i32_sub_add_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    retl
@@ -530,12 +528,12 @@ define i32 @test_i32_sub_add_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwin
 define i32 @test_i32_sub_add_sext_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 ; X86-LABEL: test_i32_sub_add_sext_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    retl
 ;
@@ -606,10 +604,10 @@ define i32 @test_i32_sub_sub_commute_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwin
 define i32 @test_i32_sub_sum_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
 ; X86-LABEL: test_i32_sub_sum_var:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    negl %eax
diff --git a/llvm/test/CodeGen/X86/add.ll b/llvm/test/CodeGen/X86/add.ll
index 079294ef09bdb..f56005b10ec0d 100644
--- a/llvm/test/CodeGen/X86/add.ll
+++ b/llvm/test/CodeGen/X86/add.ll
@@ -220,9 +220,9 @@ carry:
 define i64 @test6(i64 %A, i32 %B) nounwind {
 ; X86-LABEL: test6:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LINUX-LABEL: test6:
@@ -310,10 +310,10 @@ entry:
 define i32 @test9(i32 %x, i32 %y) nounwind readnone {
 ; X86-LABEL: test9:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl $10, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sete %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -460,23 +460,19 @@ define i32 @inc_not(i32 %a) {
 define <4 x i32> @inc_not_vec(<4 x i32> %a) nounwind {
 ; X86-LABEL: inc_not_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LINUX-LABEL: inc_not_vec:
@@ -500,10 +496,10 @@ define void @uaddo1_not(i32 %a, ptr %p0, ptr %p1) {
 ; X86-LABEL: uaddo1_not:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    setae (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
index beea6d36fe874..e129e3c985daf 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-3.ll
@@ -28,9 +28,9 @@ define i32 @mask_add_zext_i32_i64(ptr %base, i32 %i) {
 ; X86-LABEL: mask_add_zext_i32_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl 12(%eax,%ecx,4), %eax
+; X86-NEXT:    movl 12(%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mask_add_zext_i32_i64:
@@ -50,10 +50,10 @@ define i32 @mask_add_zext_i32_i64(ptr %base, i32 %i) {
 define i32 @mask_offset_scale_zext_i32_i64(ptr %base, i32 %i) {
 ; X86-LABEL: mask_offset_scale_zext_i32_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $11, %ecx
-; X86-NEXT:    movl 48(%eax,%ecx), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $11, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 48(%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mask_offset_scale_zext_i32_i64:
@@ -123,9 +123,9 @@ define i32 @PR55714_i32(i32 %n, i32 %q) {
 define i64 @PR55714_i64(i64 %n, i64 %q) {
 ; X86-LABEL: PR55714_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $7, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    leal (%edx,%ecx,8), %edx
 ; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/addr-mode-matcher-4.ll b/llvm/test/CodeGen/X86/addr-mode-matcher-4.ll
index a384bc3e55107..1450af3a09063 100644
--- a/llvm/test/CodeGen/X86/addr-mode-matcher-4.ll
+++ b/llvm/test/CodeGen/X86/addr-mode-matcher-4.ll
@@ -9,10 +9,10 @@ define double @zext_shl_mul(ptr %a0, ptr %a1) {
 ; X86-LABEL: zext_shl_mul:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
-; X86-NEXT:    fldl 16(%eax,%ecx,4)
+; X86-NEXT:    fldl 16(%ecx,%eax,4)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: zext_shl_mul:
diff --git a/llvm/test/CodeGen/X86/addsub-constant-folding.ll b/llvm/test/CodeGen/X86/addsub-constant-folding.ll
index 1cdc81223168f..2a2d7a58ee0d3 100644
--- a/llvm/test/CodeGen/X86/addsub-constant-folding.ll
+++ b/llvm/test/CodeGen/X86/addsub-constant-folding.ll
@@ -276,15 +276,16 @@ define i32 @add_const_const_sub_extrause(i32 %arg) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    leal 8(%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $-6, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    addl $8, %eax
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NEXT:    movl $-6, %eax
-; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -646,15 +647,16 @@ define i32 @sub_const_const_sub_extrause(i32 %arg) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    leal -8(%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $10, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    addl $-8, %eax
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NEXT:    movl $10, %eax
-; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -774,16 +776,17 @@ define i32 @const_sub_add_const_extrause(i32 %arg) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $8, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $8, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl $10, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NEXT:    movl $10, %eax
-; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -908,16 +911,17 @@ define i32 @const_sub_sub_const_extrause(i32 %arg) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl $8, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $8, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl $6, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NEXT:    movl $6, %eax
-; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -1042,15 +1046,16 @@ define i32 @const_sub_const_sub_extrause(i32 %arg) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl $8, %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $8, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $2, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NEXT:    movl $2, %eax
-; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/align-down-const.ll b/llvm/test/CodeGen/X86/align-down-const.ll
index 378b55195acc7..4c883b05ae289 100644
--- a/llvm/test/CodeGen/X86/align-down-const.ll
+++ b/llvm/test/CodeGen/X86/align-down-const.ll
@@ -87,11 +87,11 @@ define i32 @t3_random_constant(i32 %ptr) nounwind {
 define i32 @t4_extrause(i32 %ptr, ptr %bias_storage) nounwind {
 ; X86-LABEL: t4_extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    andl $-16, %eax
 ; X86-NEXT:    retl
 ;
@@ -114,9 +114,9 @@ define i32 @t4_extrause(i32 %ptr, ptr %bias_storage) nounwind {
 define i32 @n5_different_ptrs(i32 %ptr0, i32 %ptr1) nounwind {
 ; X86-LABEL: n5_different_ptrs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/align-down.ll b/llvm/test/CodeGen/X86/align-down.ll
index c359c04f527a3..3a9768b6ac01d 100644
--- a/llvm/test/CodeGen/X86/align-down.ll
+++ b/llvm/test/CodeGen/X86/align-down.ll
@@ -84,9 +84,9 @@ define i32 @t2_commutative(i32 %ptr, i32 %alignment) nounwind {
 define i32 @t3_extrause0(i32 %ptr, i32 %alignment, ptr %mask_storage) nounwind {
 ; NOBMI-X86-LABEL: t3_extrause0:
 ; NOBMI-X86:       # %bb.0:
-; NOBMI-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; NOBMI-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOBMI-X86-NEXT:    decl %eax
+; NOBMI-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; NOBMI-X86-NEXT:    movl %eax, (%ecx)
 ; NOBMI-X86-NEXT:    notl %eax
 ; NOBMI-X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
@@ -95,10 +95,10 @@ define i32 @t3_extrause0(i32 %ptr, i32 %alignment, ptr %mask_storage) nounwind {
 ; BMI-X86-LABEL: t3_extrause0:
 ; BMI-X86:       # %bb.0:
 ; BMI-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI-X86-NEXT:    decl %eax
 ; BMI-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; BMI-X86-NEXT:    decl %ecx
-; BMI-X86-NEXT:    movl %ecx, (%eax)
-; BMI-X86-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %eax
+; BMI-X86-NEXT:    movl %eax, (%ecx)
+; BMI-X86-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
 ; BMI-X86-NEXT:    retl
 ;
 ; NOBMI-X64-LABEL: t3_extrause0:
@@ -126,12 +126,12 @@ define i32 @n4_extrause1(i32 %ptr, i32 %alignment, ptr %bias_storage) nounwind {
 ; X86-LABEL: n4_extrause1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    decl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    decl %edx
-; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: n4_extrause1:
@@ -151,17 +151,15 @@ define i32 @n4_extrause1(i32 %ptr, i32 %alignment, ptr %bias_storage) nounwind {
 define i32 @n5_extrause2(i32 %ptr, i32 %alignment, ptr %mask_storage, ptr %bias_storage) nounwind {
 ; X86-LABEL: n5_extrause2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    decl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    decl %esi
-; X86-NEXT:    movl %esi, (%edx)
-; X86-NEXT:    andl %eax, %esi
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: n5_extrause2:
@@ -186,10 +184,10 @@ define i32 @n5_extrause2(i32 %ptr, i32 %alignment, ptr %mask_storage, ptr %bias_
 define i32 @n6_different_ptrs(i32 %ptr0, i32 %ptr1, i32 %alignment) nounwind {
 ; X86-LABEL: n6_different_ptrs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    decl %ecx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -208,10 +206,10 @@ define i32 @n6_different_ptrs(i32 %ptr0, i32 %ptr1, i32 %alignment) nounwind {
 define i32 @n7_different_ptrs_commutative(i32 %ptr0, i32 %ptr1, i32 %alignment) nounwind {
 ; X86-LABEL: n7_different_ptrs_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    decl %ecx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -266,9 +264,9 @@ define i32 @n8_not_lowbit_mask(i32 %ptr, i32 %alignment) nounwind {
 define i32 @n9_sub_is_not_commutative(i32 %ptr, i32 %alignment) nounwind {
 ; X86-LABEL: n9_sub_is_not_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    decl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/and-mask-variable.ll b/llvm/test/CodeGen/X86/and-mask-variable.ll
index 3e5bd6952147c..e0cd4f4b63a55 100644
--- a/llvm/test/CodeGen/X86/and-mask-variable.ll
+++ b/llvm/test/CodeGen/X86/and-mask-variable.ll
@@ -95,12 +95,9 @@ define i64 @mask_pair_64(i64 %x, i64 %y) nounwind {
 define i128 @mask_pair_128(i128 %x, i128 %y) nounwind {
 ; X86-NOBMI-LABEL: mask_pair_128:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $32, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    subl $36, %esp
 ; X86-NOBMI-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    movl $-1, {{[0-9]+}}(%esp)
@@ -109,42 +106,40 @@ define i128 @mask_pair_128(i128 %x, i128 %y) nounwind {
 ; X86-NOBMI-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    movl $0, (%esp)
-; X86-NOBMI-NEXT:    movl %ecx, %edx
-; X86-NOBMI-NEXT:    shrb $3, %dl
-; X86-NOBMI-NEXT:    andb $12, %dl
-; X86-NOBMI-NEXT:    negb %dl
-; X86-NOBMI-NEXT:    movsbl %dl, %ebx
-; X86-NOBMI-NEXT:    movl 24(%esp,%ebx), %edx
-; X86-NOBMI-NEXT:    movl 28(%esp,%ebx), %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %esi
-; X86-NOBMI-NEXT:    movl 16(%esp,%ebx), %edi
-; X86-NOBMI-NEXT:    movl 20(%esp,%ebx), %ebx
-; X86-NOBMI-NEXT:    shldl %cl, %ebx, %edx
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %ebx
-; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, %eax
+; X86-NOBMI-NEXT:    shrb $3, %al
+; X86-NOBMI-NEXT:    andb $12, %al
+; X86-NOBMI-NEXT:    negb %al
+; X86-NOBMI-NEXT:    movsbl %al, %edi
+; X86-NOBMI-NEXT:    movl 24(%esp,%edi), %esi
+; X86-NOBMI-NEXT:    movl 28(%esp,%edi), %edx
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %edx, 12(%eax)
+; X86-NOBMI-NEXT:    movl 16(%esp,%edi), %edx
+; X86-NOBMI-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NOBMI-NEXT:    shldl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, 8(%eax)
+; X86-NOBMI-NEXT:    shldl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl %esi, 12(%eax)
-; X86-NOBMI-NEXT:    movl %edx, 8(%eax)
-; X86-NOBMI-NEXT:    movl %ebx, 4(%eax)
-; X86-NOBMI-NEXT:    movl %edi, (%eax)
-; X86-NOBMI-NEXT:    addl $32, %esp
+; X86-NOBMI-NEXT:    movl %edi, 4(%eax)
+; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shll %cl, %edx
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movl %edx, (%eax)
+; X86-NOBMI-NEXT:    addl $36, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl $4
 ;
 ; X86-BMI2-LABEL: mask_pair_128:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $32, %esp
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    subl $36, %esp
 ; X86-BMI2-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-BMI2-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-BMI2-NEXT:    movl $-1, {{[0-9]+}}(%esp)
@@ -153,31 +148,32 @@ define i128 @mask_pair_128(i128 %x, i128 %y) nounwind {
 ; X86-BMI2-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-BMI2-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-BMI2-NEXT:    movl $0, (%esp)
-; X86-BMI2-NEXT:    movl %ecx, %edx
-; X86-BMI2-NEXT:    shrb $3, %dl
-; X86-BMI2-NEXT:    andb $12, %dl
-; X86-BMI2-NEXT:    negb %dl
-; X86-BMI2-NEXT:    movsbl %dl, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl %ecx, %eax
+; X86-BMI2-NEXT:    shrb $3, %al
+; X86-BMI2-NEXT:    andb $12, %al
+; X86-BMI2-NEXT:    negb %al
+; X86-BMI2-NEXT:    movsbl %al, %edi
 ; X86-BMI2-NEXT:    movl 24(%esp,%edi), %edx
 ; X86-BMI2-NEXT:    movl 28(%esp,%edi), %esi
 ; X86-BMI2-NEXT:    shldl %cl, %edx, %esi
-; X86-BMI2-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %esi, 12(%eax)
+; X86-BMI2-NEXT:    movl 16(%esp,%edi), %esi
 ; X86-BMI2-NEXT:    movl 20(%esp,%edi), %edi
 ; X86-BMI2-NEXT:    shldl %cl, %edi, %edx
-; X86-BMI2-NEXT:    shldl %cl, %ebx, %edi
-; X86-BMI2-NEXT:    shlxl %ecx, %ebx, %ecx
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-BMI2-NEXT:    movl %esi, 12(%eax)
 ; X86-BMI2-NEXT:    movl %edx, 8(%eax)
+; X86-BMI2-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
 ; X86-BMI2-NEXT:    movl %edi, 4(%eax)
+; X86-BMI2-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %ecx, (%eax)
-; X86-BMI2-NEXT:    addl $32, %esp
+; X86-BMI2-NEXT:    addl $36, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl $4
 ;
 ; X64-NOBMI-LABEL: mask_pair_128:
diff --git a/llvm/test/CodeGen/X86/and-or-fold.ll b/llvm/test/CodeGen/X86/and-or-fold.ll
index 1bb5fdeebac71..171eb1bade873 100644
--- a/llvm/test/CodeGen/X86/and-or-fold.ll
+++ b/llvm/test/CodeGen/X86/and-or-fold.ll
@@ -42,9 +42,9 @@ define i32 @or_and_trunc_fold(i64 %x, i64 %y) {
 define i32 @test1(i32 %x, i16 %y) {
 ; DARWIN-LABEL: test1:
 ; DARWIN:       ## %bb.0:
-; DARWIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; DARWIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; DARWIN-NEXT:    shll $16, %eax
+; DARWIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; DARWIN-NEXT:    shll $16, %ecx
+; DARWIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; DARWIN-NEXT:    orl %ecx, %eax
 ; DARWIN-NEXT:    andl $16711807, %eax ## imm = 0xFF007F
 ; DARWIN-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/and-or-setcc.ll b/llvm/test/CodeGen/X86/and-or-setcc.ll
index 4484f23bbda36..bcf65e58ace87 100644
--- a/llvm/test/CodeGen/X86/and-or-setcc.ll
+++ b/llvm/test/CodeGen/X86/and-or-setcc.ll
@@ -59,50 +59,42 @@ define i1 @or_uno(float %a, float %b) {
 define <4 x i1> @and_ord_vec(<4 x float> %a, <4 x float> %b) {
 ; X86-LABEL: and_ord_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp) # 4-byte Folded Spill
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
 ; X86-NEXT:    fnstsw %ax
-; X86-NEXT:    fucompp
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    setnp %dl
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
-; X86-NEXT:    setnp %dh
-; X86-NEXT:    shlb $2, %dh
+; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    fnstsw %ax
-; X86-NEXT:    flds (%esp) # 4-byte Folded Reload
-; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    setnp %cl
+; X86-NEXT:    shlb $3, %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setnp %dl
-; X86-NEXT:    shlb $3, %dl
-; X86-NEXT:    orb %dh, %dl
+; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setnp %dh
+; X86-NEXT:    setnp %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucompp
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setnp %al
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    orb %dh, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %ecx
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    setnp %ch
+; X86-NEXT:    addb %ch, %ch
+; X86-NEXT:    orb %dl, %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %ch, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: and_ord_vec:
@@ -118,50 +110,42 @@ define <4 x i1> @and_ord_vec(<4 x float> %a, <4 x float> %b) {
 define <4 x i1> @or_uno_vec(<4 x float> %a, <4 x float> %b) {
 ; X86-LABEL: or_uno_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp) # 4-byte Folded Spill
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
 ; X86-NEXT:    fnstsw %ax
-; X86-NEXT:    fucompp
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    setp %dl
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
-; X86-NEXT:    setp %dh
-; X86-NEXT:    shlb $2, %dh
+; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    fnstsw %ax
-; X86-NEXT:    flds (%esp) # 4-byte Folded Reload
-; X86-NEXT:    fxch %st(1)
+; X86-NEXT:    setp %cl
+; X86-NEXT:    shlb $3, %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucompp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setp %dl
-; X86-NEXT:    shlb $3, %dl
-; X86-NEXT:    orb %dh, %dl
+; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %dh
+; X86-NEXT:    setp %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucompp
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %al
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    orb %dh, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %ecx
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    setp %ch
+; X86-NEXT:    addb %ch, %ch
+; X86-NEXT:    orb %dl, %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %ch, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: or_uno_vec:
diff --git a/llvm/test/CodeGen/X86/and-sink.ll b/llvm/test/CodeGen/X86/and-sink.ll
index d9a34d743ac65..8a83d36c9c923 100644
--- a/llvm/test/CodeGen/X86/and-sink.ll
+++ b/llvm/test/CodeGen/X86/and-sink.ll
@@ -15,8 +15,8 @@ define i32 @and_sink1(i32 %a, i1 %c) {
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.1: # %bb0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    testb $4, %al
 ; CHECK-NEXT:    movl $0, A
+; CHECK-NEXT:    testb $4, %al
 ; CHECK-NEXT:    jne .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %bb1
 ; CHECK-NEXT:    movl $1, %eax
@@ -62,8 +62,8 @@ define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
 ; CHECK-NEXT:    je .LBB1_5
 ; CHECK-NEXT:  # %bb.3: # %bb1
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    testb $4, %cl
 ; CHECK-NEXT:    movl $0, C
+; CHECK-NEXT:    testb $4, %cl
 ; CHECK-NEXT:    jne .LBB1_2
 ; CHECK-NEXT:  # %bb.4: # %bb2
 ; CHECK-NEXT:    movl $1, %eax
@@ -107,8 +107,8 @@ define i32 @and_sink3(i1 %c, ptr %p) {
 ; CHECK-NEXT:  # %bb.1: # %bb0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movzbl (%eax), %eax
-; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    movl $0, A
+; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: # %bb2
 ; CHECK-NEXT:    xorl %eax, %eax
@@ -145,13 +145,13 @@ define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
 ; CHECK-NEXT:  # %bb.1: # %bb0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    testl %eax, %ecx
 ; CHECK-NEXT:    movl $0, A
+; CHECK-NEXT:    testl %eax, %ecx
 ; CHECK-NEXT:    jne .LBB3_4
 ; CHECK-NEXT:  # %bb.2: # %bb1
 ; CHECK-NEXT:    leal (%ecx,%eax), %edx
-; CHECK-NEXT:    testl %eax, %ecx
 ; CHECK-NEXT:    movl %edx, B
+; CHECK-NEXT:    testl %eax, %ecx
 ; CHECK-NEXT:    je .LBB3_3
 ; CHECK-NEXT:  .LBB3_4: # %bb3
 ; CHECK-NEXT:    xorl %eax, %eax
@@ -201,8 +201,8 @@ define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
 ; CHECK-NEXT:  # %bb.2: # %bb1
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    movl %ecx, B
+; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je .LBB4_3
 ; CHECK-NEXT:  .LBB4_4: # %bb3
 ; CHECK-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/and-su.ll b/llvm/test/CodeGen/X86/and-su.ll
index b657ed28f6dc6..4e2809cd1a7c8 100644
--- a/llvm/test/CodeGen/X86/and-su.ll
+++ b/llvm/test/CodeGen/X86/and-su.ll
@@ -30,10 +30,10 @@ define fastcc double @bar(i32 %hash, double %x, double %y) nounwind {
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-8, %esp
-; CHECK-NEXT:    fldl 16(%ebp)
-; CHECK-NEXT:    fldl 8(%ebp)
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    andl $15, %eax
+; CHECK-NEXT:    fldl 16(%ebp)
+; CHECK-NEXT:    fldl 8(%ebp)
 ; CHECK-NEXT:    cmpl $8, %eax
 ; CHECK-NEXT:    jb .LBB1_2
 ; CHECK-NEXT:  # %bb.1: # %bb10
diff --git a/llvm/test/CodeGen/X86/andimm8.ll b/llvm/test/CodeGen/X86/andimm8.ll
index 6242d4f4c222b..00431c49d273e 100644
--- a/llvm/test/CodeGen/X86/andimm8.ll
+++ b/llvm/test/CodeGen/X86/andimm8.ll
@@ -25,13 +25,13 @@ define i64 @bra(i32 %zed) nounwind {
 define void @foo(i64 %zed, ptr %x) nounwind {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x08]
-; X86-NEXT:    andl $-4, %ecx # encoding: [0x83,0xe1,0xfc]
-; X86-NEXT:    orl $2, %ecx # encoding: [0x83,0xc9,0x02]
-; X86-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
-; X86-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    movl %ecx, 4(%edx) # encoding: [0x89,0x4a,0x04]
+; X86-NEXT:    andl $-4, %eax # encoding: [0x83,0xe0,0xfc]
+; X86-NEXT:    orl $2, %eax # encoding: [0x83,0xc8,0x02]
+; X86-NEXT:    movl %eax, (%edx) # encoding: [0x89,0x02]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/andnot-blsmsk.ll b/llvm/test/CodeGen/X86/andnot-blsmsk.ll
index 74766821f6ce7..c89d13f9b58c2 100644
--- a/llvm/test/CodeGen/X86/andnot-blsmsk.ll
+++ b/llvm/test/CodeGen/X86/andnot-blsmsk.ll
@@ -150,15 +150,15 @@ define i64 @fold_and_xor_neg_v1_64(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: fold_and_xor_neg_v1_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll
index fc573fbd4fc99..725be0cf72447 100644
--- a/llvm/test/CodeGen/X86/andnot-patterns.ll
+++ b/llvm/test/CodeGen/X86/andnot-patterns.ll
@@ -17,9 +17,9 @@ define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-NOBMI-LABEL: andnot_rotl_i64:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB0_1
 ; X86-NOBMI-NEXT:  # %bb.2:
@@ -43,9 +43,9 @@ define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-BMI-LABEL: andnot_rotl_i64:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    testb $32, %cl
 ; X86-BMI-NEXT:    jne .LBB0_1
 ; X86-BMI-NEXT:  # %bb.2:
@@ -196,30 +196,30 @@ define i64 @andnot_rotl_i64_multiuse_rot(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    notl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    notl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    jmp .LBB4_3
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB4_3:
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    andl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    andl %ebx, %edi
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll use_i64 at PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    movl %esi, %eax
@@ -258,9 +258,9 @@ define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-NOBMI-LABEL: andnot_rotr_i64:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB5_1
 ; X86-NOBMI-NEXT:  # %bb.2:
@@ -284,9 +284,9 @@ define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-BMI-LABEL: andnot_rotr_i64:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    testb $32, %cl
 ; X86-BMI-NEXT:    je .LBB5_1
 ; X86-BMI-NEXT:  # %bb.2:
@@ -435,9 +435,9 @@ define i32 @andnot_rotr_i32_multiuse_not(i32 %a0, i32 %a1, i32 %a2) nounwind {
 ; X86-NOBMI-LABEL: andnot_rotr_i32_multiuse_not:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    notl %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl %eax, %esi
 ; X86-NOBMI-NEXT:    rorl %cl, %esi
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
@@ -451,10 +451,10 @@ define i32 @andnot_rotr_i32_multiuse_not(i32 %a0, i32 %a1, i32 %a2) nounwind {
 ; X86-BMI-LABEL: andnot_rotr_i32_multiuse_not:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl %eax, %edx
 ; X86-BMI-NEXT:    notl %edx
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    rorl %cl, %eax
 ; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %esi
 ; X86-BMI-NEXT:    pushl %edx
diff --git a/llvm/test/CodeGen/X86/andnot-sink-not.ll b/llvm/test/CodeGen/X86/andnot-sink-not.ll
index fefbdc84699f4..d3ed044e60e2d 100644
--- a/llvm/test/CodeGen/X86/andnot-sink-not.ll
+++ b/llvm/test/CodeGen/X86/andnot-sink-not.ll
@@ -484,128 +484,122 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
 ; X86-LABEL: and_sink_not_v8i8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB8_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    notb %dh
-; X86-NEXT:    andb %ch, %dh
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    notb %ch
 ; X86-NEXT:    andb %dl, %ch
+; X86-NEXT:    movb %ch, (%eax)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    notb %dl
 ; X86-NEXT:    andb %cl, %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    notb %cl
 ; X86-NEXT:    andb %bh, %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    notb %bh
-; X86-NEXT:    andb %bl, %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    notb %bl
-; X86-NEXT:    andb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    notb %al
-; X86-NEXT:    andb %ah, %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    notb %ah
-; X86-NEXT:    andb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb %ah, (%esi)
-; X86-NEXT:    movb %al, 1(%esi)
-; X86-NEXT:    movb %bl, 2(%esi)
-; X86-NEXT:    movb %bh, 3(%esi)
-; X86-NEXT:    movb %cl, 4(%esi)
-; X86-NEXT:    movb %dl, 5(%esi)
-; X86-NEXT:    movb %ch, 6(%esi)
-; X86-NEXT:    movb %dh, 7(%esi)
-; X86-NEXT:    jmp .LBB8_3
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %bl, %cl
+; X86-NEXT:    movb %cl, 3(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %dh, %cl
+; X86-NEXT:    movb %cl, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %cl, 5(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %cl, 6(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %cl, 7(%eax)
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB8_2: # %identity
-; X86-NEXT:    movb %al, (%esi)
-; X86-NEXT:    movb %ah, 1(%esi)
-; X86-NEXT:    movb %dh, 2(%esi)
-; X86-NEXT:    movb %bl, 3(%esi)
-; X86-NEXT:    movb %bh, 4(%esi)
-; X86-NEXT:    movb %cl, 5(%esi)
-; X86-NEXT:    movb %dl, 6(%esi)
-; X86-NEXT:    movb %ch, 7(%esi)
-; X86-NEXT:  .LBB8_3: # %identity
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movb %dl, (%eax)
+; X86-NEXT:    movb %cl, 1(%eax)
+; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb %bl, 3(%eax)
+; X86-NEXT:    movb %dh, 4(%eax)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %cl, 5(%eax)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movb %cl, 6(%eax)
+; X86-NEXT:    movb %ch, 7(%eax)
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X86-SSE-LABEL: and_sink_not_v8i8:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    je .LBB8_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    notb %dh
-; X86-SSE-NEXT:    andb %ch, %dh
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-SSE-NEXT:    notb %ch
 ; X86-SSE-NEXT:    andb %dl, %ch
+; X86-SSE-NEXT:    movb %ch, (%eax)
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-SSE-NEXT:    notb %dl
 ; X86-SSE-NEXT:    andb %cl, %dl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %dl, 1(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    notb %cl
 ; X86-SSE-NEXT:    andb %bh, %cl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-SSE-NEXT:    notb %bh
-; X86-SSE-NEXT:    andb %bl, %bh
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; X86-SSE-NEXT:    notb %bl
-; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %bl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-SSE-NEXT:    notb %al
-; X86-SSE-NEXT:    andb %ah, %al
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-SSE-NEXT:    notb %ah
-; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %ah
-; X86-SSE-NEXT:    movb %ah, (%esi)
-; X86-SSE-NEXT:    movb %al, 1(%esi)
-; X86-SSE-NEXT:    movb %bl, 2(%esi)
-; X86-SSE-NEXT:    movb %bh, 3(%esi)
-; X86-SSE-NEXT:    movb %cl, 4(%esi)
-; X86-SSE-NEXT:    movb %dl, 5(%esi)
-; X86-SSE-NEXT:    movb %ch, 6(%esi)
-; X86-SSE-NEXT:    movb %dh, 7(%esi)
-; X86-SSE-NEXT:    jmp .LBB8_3
+; X86-SSE-NEXT:    movb %cl, 2(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb %bl, %cl
+; X86-SSE-NEXT:    movb %cl, 3(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb %dh, %cl
+; X86-SSE-NEXT:    movb %cl, 4(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %cl, 5(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %cl, 6(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %cl, 7(%eax)
+; X86-SSE-NEXT:    popl %ebx
+; X86-SSE-NEXT:    retl $4
 ; X86-SSE-NEXT:  .LBB8_2: # %identity
-; X86-SSE-NEXT:    movb %al, (%esi)
-; X86-SSE-NEXT:    movb %ah, 1(%esi)
-; X86-SSE-NEXT:    movb %dh, 2(%esi)
-; X86-SSE-NEXT:    movb %bl, 3(%esi)
-; X86-SSE-NEXT:    movb %bh, 4(%esi)
-; X86-SSE-NEXT:    movb %cl, 5(%esi)
-; X86-SSE-NEXT:    movb %dl, 6(%esi)
-; X86-SSE-NEXT:    movb %ch, 7(%esi)
-; X86-SSE-NEXT:  .LBB8_3: # %identity
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movb %dl, (%eax)
+; X86-SSE-NEXT:    movb %cl, 1(%eax)
+; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb %bl, 3(%eax)
+; X86-SSE-NEXT:    movb %dh, 4(%eax)
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %cl, 5(%eax)
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-SSE-NEXT:    movb %cl, 6(%eax)
+; X86-SSE-NEXT:    movb %ch, 7(%eax)
 ; X86-SSE-NEXT:    popl %ebx
 ; X86-SSE-NEXT:    retl $4
 ;
@@ -622,64 +616,61 @@ define <8 x i8> @and_sink_not_v8i8(<8 x i8> %x, <8 x i8> %m, i1 zeroext %cond) n
 ; X86-BMI-LABEL: and_sink_not_v8i8:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB8_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT:    notb %cl
-; X86-BMI-NEXT:    andb %dh, %cl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    notb %dh
-; X86-BMI-NEXT:    andb %ch, %dh
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI-NEXT:    notb %ch
 ; X86-BMI-NEXT:    andb %dl, %ch
+; X86-BMI-NEXT:    movb %ch, (%eax)
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI-NEXT:    notb %dl
-; X86-BMI-NEXT:    andb %bh, %dl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-BMI-NEXT:    notb %bh
-; X86-BMI-NEXT:    andb %bl, %bh
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bl
-; X86-BMI-NEXT:    notb %bl
-; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %bl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI-NEXT:    notb %al
-; X86-BMI-NEXT:    andb %ah, %al
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-BMI-NEXT:    notb %ah
-; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %ah
-; X86-BMI-NEXT:    movb %ah, (%esi)
-; X86-BMI-NEXT:    movb %al, 1(%esi)
-; X86-BMI-NEXT:    movb %bl, 2(%esi)
-; X86-BMI-NEXT:    movb %bh, 3(%esi)
-; X86-BMI-NEXT:    movb %dl, 4(%esi)
-; X86-BMI-NEXT:    movb %ch, 5(%esi)
-; X86-BMI-NEXT:    movb %dh, 6(%esi)
-; X86-BMI-NEXT:    movb %cl, 7(%esi)
-; X86-BMI-NEXT:    jmp .LBB8_3
+; X86-BMI-NEXT:    andb %cl, %dl
+; X86-BMI-NEXT:    movb %dl, 1(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %bh, %cl
+; X86-BMI-NEXT:    movb %cl, 2(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %bl, %cl
+; X86-BMI-NEXT:    movb %cl, 3(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %dh, %cl
+; X86-BMI-NEXT:    movb %cl, 4(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb %cl, 5(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb %cl, 6(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb %cl, 7(%eax)
+; X86-BMI-NEXT:    popl %ebx
+; X86-BMI-NEXT:    retl $4
 ; X86-BMI-NEXT:  .LBB8_2: # %identity
-; X86-BMI-NEXT:    movb %al, (%esi)
-; X86-BMI-NEXT:    movb %ah, 1(%esi)
-; X86-BMI-NEXT:    movb %cl, 2(%esi)
-; X86-BMI-NEXT:    movb %bl, 3(%esi)
-; X86-BMI-NEXT:    movb %bh, 4(%esi)
-; X86-BMI-NEXT:    movb %dl, 5(%esi)
-; X86-BMI-NEXT:    movb %ch, 6(%esi)
-; X86-BMI-NEXT:    movb %dh, 7(%esi)
-; X86-BMI-NEXT:  .LBB8_3: # %identity
-; X86-BMI-NEXT:    movl %esi, %eax
-; X86-BMI-NEXT:    popl %esi
+; X86-BMI-NEXT:    movb %dl, (%eax)
+; X86-BMI-NEXT:    movb %cl, 1(%eax)
+; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movb %bl, 3(%eax)
+; X86-BMI-NEXT:    movb %dh, 4(%eax)
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb %cl, 5(%eax)
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb %cl, 6(%eax)
+; X86-BMI-NEXT:    movb %ch, 7(%eax)
 ; X86-BMI-NEXT:    popl %ebx
 ; X86-BMI-NEXT:    retl $4
 ;
@@ -716,144 +707,128 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
 ; X86-LABEL: and_sink_not_v8i8_swapped:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB9_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movb %ch, %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %dh
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    andb %ch, %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %bl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %cl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    notb %ch
 ; X86-NEXT:    andb %ch, %dl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    andb %ch, %dh
-; X86-NEXT:    movb %dh, (%esi)
-; X86-NEXT:    movb %dl, 1(%esi)
-; X86-NEXT:    movb %cl, 2(%esi)
-; X86-NEXT:    movb %bh, 3(%esi)
-; X86-NEXT:    movb %bl, 4(%esi)
-; X86-NEXT:    movb %al, 5(%esi)
-; X86-NEXT:    movb %ah, 6(%esi)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 7(%esi)
-; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:    movb %dl, (%eax)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    notb %dl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    movb %cl, 1(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %cl, %bh
+; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %cl, %bl
+; X86-NEXT:    movb %bl, 3(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    andb %cl, %dh
+; X86-NEXT:    movb %dh, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    movb %dl, 5(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    movb %dl, 6(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    movb %dl, 7(%eax)
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB9_2: # %identity
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb %dh, (%esi)
-; X86-NEXT:    movb %dl, 1(%esi)
-; X86-NEXT:    movb %cl, 2(%esi)
-; X86-NEXT:    movb %bh, 3(%esi)
-; X86-NEXT:    movb %bl, 4(%esi)
+; X86-NEXT:    movb %dl, (%eax)
+; X86-NEXT:    movb %cl, 1(%eax)
+; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb %bl, 3(%eax)
+; X86-NEXT:    movb %dh, 4(%eax)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb %cl, 5(%esi)
+; X86-NEXT:    movb %cl, 5(%eax)
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb %cl, 6(%esi)
-; X86-NEXT:    movb %ch, 7(%esi)
-; X86-NEXT:  .LBB9_3: # %identity
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $4, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movb %cl, 6(%eax)
+; X86-NEXT:    movb %ch, 7(%eax)
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X86-SSE-LABEL: and_sink_not_v8i8_swapped:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    je .LBB9_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movb %ch, %dh
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %dh
-; X86-SSE-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-SSE-NEXT:    andb %ch, %ah
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %al
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %bl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %bh
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %cl
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-SSE-NEXT:    notb %ch
 ; X86-SSE-NEXT:    andb %ch, %dl
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    andb %ch, %dh
-; X86-SSE-NEXT:    movb %dh, (%esi)
-; X86-SSE-NEXT:    movb %dl, 1(%esi)
-; X86-SSE-NEXT:    movb %cl, 2(%esi)
-; X86-SSE-NEXT:    movb %bh, 3(%esi)
-; X86-SSE-NEXT:    movb %bl, 4(%esi)
-; X86-SSE-NEXT:    movb %al, 5(%esi)
-; X86-SSE-NEXT:    movb %ah, 6(%esi)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE-NEXT:    movb %cl, 7(%esi)
-; X86-SSE-NEXT:    jmp .LBB9_3
+; X86-SSE-NEXT:    movb %dl, (%eax)
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT:    notb %dl
+; X86-SSE-NEXT:    andb %dl, %cl
+; X86-SSE-NEXT:    movb %cl, 1(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb %cl, %bh
+; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb %cl, %bl
+; X86-SSE-NEXT:    movb %bl, 3(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    andb %cl, %dh
+; X86-SSE-NEXT:    movb %dh, 4(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    andb %cl, %dl
+; X86-SSE-NEXT:    movb %dl, 5(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    andb %cl, %dl
+; X86-SSE-NEXT:    movb %dl, 6(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notb %cl
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    andb %cl, %dl
+; X86-SSE-NEXT:    movb %dl, 7(%eax)
+; X86-SSE-NEXT:    popl %ebx
+; X86-SSE-NEXT:    retl $4
 ; X86-SSE-NEXT:  .LBB9_2: # %identity
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    movb %dh, (%esi)
-; X86-SSE-NEXT:    movb %dl, 1(%esi)
-; X86-SSE-NEXT:    movb %cl, 2(%esi)
-; X86-SSE-NEXT:    movb %bh, 3(%esi)
-; X86-SSE-NEXT:    movb %bl, 4(%esi)
+; X86-SSE-NEXT:    movb %dl, (%eax)
+; X86-SSE-NEXT:    movb %cl, 1(%eax)
+; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb %bl, 3(%eax)
+; X86-SSE-NEXT:    movb %dh, 4(%eax)
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SSE-NEXT:    movb %cl, 5(%esi)
+; X86-SSE-NEXT:    movb %cl, 5(%eax)
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-SSE-NEXT:    movb %cl, 6(%esi)
-; X86-SSE-NEXT:    movb %ch, 7(%esi)
-; X86-SSE-NEXT:  .LBB9_3: # %identity
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $4, %esp
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movb %cl, 6(%eax)
+; X86-SSE-NEXT:    movb %ch, 7(%eax)
 ; X86-SSE-NEXT:    popl %ebx
 ; X86-SSE-NEXT:    retl $4
 ;
@@ -870,72 +845,64 @@ define <8 x i8> @and_sink_not_v8i8_swapped(<8 x i8> %x, <8 x i8> %m, i1 zeroext
 ; X86-BMI-LABEL: and_sink_not_v8i8_swapped:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    pushl %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB9_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
-; X86-BMI-NEXT:    movb %ch, %dh
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %dh
-; X86-BMI-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-BMI-NEXT:    andb %ch, %ah
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %al
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %bl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %bh
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %cl
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI-NEXT:    notb %ch
 ; X86-BMI-NEXT:    andb %ch, %dl
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    andb %ch, %dh
-; X86-BMI-NEXT:    movb %dh, (%esi)
-; X86-BMI-NEXT:    movb %dl, 1(%esi)
-; X86-BMI-NEXT:    movb %cl, 2(%esi)
-; X86-BMI-NEXT:    movb %bh, 3(%esi)
-; X86-BMI-NEXT:    movb %bl, 4(%esi)
-; X86-BMI-NEXT:    movb %al, 5(%esi)
-; X86-BMI-NEXT:    movb %ah, 6(%esi)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-BMI-NEXT:    movb %cl, 7(%esi)
-; X86-BMI-NEXT:    jmp .LBB9_3
+; X86-BMI-NEXT:    movb %dl, (%eax)
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI-NEXT:    notb %dl
+; X86-BMI-NEXT:    andb %dl, %cl
+; X86-BMI-NEXT:    movb %cl, 1(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %cl, %bh
+; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %cl, %bl
+; X86-BMI-NEXT:    movb %bl, 3(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    andb %cl, %dh
+; X86-BMI-NEXT:    movb %dh, 4(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andb %cl, %dl
+; X86-BMI-NEXT:    movb %dl, 5(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andb %cl, %dl
+; X86-BMI-NEXT:    movb %dl, 6(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    notb %cl
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andb %cl, %dl
+; X86-BMI-NEXT:    movb %dl, 7(%eax)
+; X86-BMI-NEXT:    popl %ebx
+; X86-BMI-NEXT:    retl $4
 ; X86-BMI-NEXT:  .LBB9_2: # %identity
-; X86-BMI-NEXT:    movb %dh, (%esi)
-; X86-BMI-NEXT:    movb %dl, 1(%esi)
-; X86-BMI-NEXT:    movb %cl, 2(%esi)
-; X86-BMI-NEXT:    movb %bh, 3(%esi)
-; X86-BMI-NEXT:    movb %bl, 4(%esi)
+; X86-BMI-NEXT:    movb %dl, (%eax)
+; X86-BMI-NEXT:    movb %cl, 1(%eax)
+; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movb %bl, 3(%eax)
+; X86-BMI-NEXT:    movb %dh, 4(%eax)
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT:    movb %cl, 5(%esi)
+; X86-BMI-NEXT:    movb %cl, 5(%eax)
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI-NEXT:    movb %cl, 6(%esi)
-; X86-BMI-NEXT:    movb %ch, 7(%esi)
-; X86-BMI-NEXT:  .LBB9_3: # %identity
-; X86-BMI-NEXT:    movl %esi, %eax
-; X86-BMI-NEXT:    addl $4, %esp
-; X86-BMI-NEXT:    popl %esi
+; X86-BMI-NEXT:    movb %cl, 6(%eax)
+; X86-BMI-NEXT:    movb %ch, 7(%eax)
 ; X86-BMI-NEXT:    popl %ebx
 ; X86-BMI-NEXT:    retl $4
 ;
@@ -975,35 +942,35 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB10_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    notl %edi
-; X86-NEXT:    andl %ebx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %esi, %ebx
+; X86-NEXT:    andl %edi, %ebx
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    notl %edi
+; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    jmp .LBB10_3
 ; X86-NEXT:  .LBB10_2: # %identity
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:  .LBB10_3: # %identity
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1019,16 +986,15 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $64, %esp
+; X86-SSE-NEXT:    movups 28(%ebp), %xmm0
+; X86-SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE-NEXT:    movl 24(%ebp), %ecx
 ; X86-SSE-NEXT:    movl 20(%ebp), %edx
 ; X86-SSE-NEXT:    movl 16(%ebp), %esi
-; X86-SSE-NEXT:    movzbl 44(%ebp), %ebx
-; X86-SSE-NEXT:    testb %bl, %bl
 ; X86-SSE-NEXT:    movl 12(%ebp), %edi
-; X86-SSE-NEXT:    movups 28(%ebp), %xmm0
-; X86-SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    cmpb $0, 44(%ebp)
 ; X86-SSE-NEXT:    je .LBB10_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -1092,26 +1058,31 @@ define <4 x i32> @and_sink_not_v4i32(<4 x i32> %x, <4 x i32> %m, i1 zeroext %con
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB10_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    andnl %edi, %ebx, %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    andnl %esi, %edi, %esi
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    jmp .LBB10_3
 ; X86-BMI-NEXT:  .LBB10_2: # %identity
-; X86-BMI-NEXT:    movl %ecx, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl %esi, 8(%eax)
-; X86-BMI-NEXT:    movl %edi, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:  .LBB10_3: # %identity
+; X86-BMI-NEXT:    movl %ecx, 12(%eax)
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -1162,20 +1133,25 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
 ; X86-NEXT:  # %bb.1: # %mask
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %ebx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %ebx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %ebx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    notl %ebx
 ; X86-NEXT:    andl %ebx, %edi
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    notl %edi
+; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    notl %esi
+; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    jmp .LBB11_3
 ; X86-NEXT:  .LBB11_2: # %identity
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:  .LBB11_3: # %identity
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1191,16 +1167,15 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $64, %esp
-; X86-SSE-NEXT:    movl 8(%ebp), %eax
-; X86-SSE-NEXT:    movl 24(%ebp), %ecx
-; X86-SSE-NEXT:    movl 20(%ebp), %edx
-; X86-SSE-NEXT:    movl 16(%ebp), %esi
-; X86-SSE-NEXT:    movzbl 44(%ebp), %ebx
-; X86-SSE-NEXT:    testb %bl, %bl
-; X86-SSE-NEXT:    movl 12(%ebp), %edi
 ; X86-SSE-NEXT:    movups 28(%ebp), %xmm0
 ; X86-SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl 8(%ebp), %eax
+; X86-SSE-NEXT:    movl 24(%ebp), %ecx
+; X86-SSE-NEXT:    movl 20(%ebp), %edx
+; X86-SSE-NEXT:    movl 16(%ebp), %esi
+; X86-SSE-NEXT:    movl 12(%ebp), %edi
+; X86-SSE-NEXT:    cmpb $0, 44(%ebp)
 ; X86-SSE-NEXT:    je .LBB11_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -1264,26 +1239,31 @@ define <4 x i32> @and_sink_not_v4i32_swapped(<4 x i32> %x, <4 x i32> %m, i1 zero
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB11_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    andnl %edi, %ebx, %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    andnl %esi, %edi, %esi
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    jmp .LBB11_3
 ; X86-BMI-NEXT:  .LBB11_2: # %identity
-; X86-BMI-NEXT:    movl %ecx, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl %esi, 8(%eax)
-; X86-BMI-NEXT:    movl %edi, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:  .LBB11_3: # %identity
+; X86-BMI-NEXT:    movl %ecx, 12(%eax)
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -1325,69 +1305,61 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    je .LBB12_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    notl %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl %edx, %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    andl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    notl %ebp
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    notl %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    movl %eax, 4(%edx)
-; X86-NEXT:    movl %edi, 8(%edx)
-; X86-NEXT:    movl %ebp, 12(%edx)
-; X86-NEXT:    movl %ecx, 16(%edx)
-; X86-NEXT:    movl %esi, 20(%edx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 24(%edx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 28(%edx)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %ebx, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    jmp .LBB12_3
 ; X86-NEXT:  .LBB12_2: # %identity
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ebp, 16(%eax)
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl %esi, 28(%eax)
 ; X86-NEXT:  .LBB12_3: # %identity
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1400,69 +1372,61 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $8, %esp
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    je .LBB12_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    notl %edi
-; X86-SSE-NEXT:    andl %esi, %edi
-; X86-SSE-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    notl %esi
 ; X86-SSE-NEXT:    andl %edx, %esi
-; X86-SSE-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, (%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    notl %edx
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, 4(%eax)
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    notl %ecx
 ; X86-SSE-NEXT:    andl %ebp, %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SSE-NEXT:    notl %ebp
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ebp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    notl %edi
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    movl %eax, %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    notl %eax
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    notl %ebx
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl %ebx, (%edx)
-; X86-SSE-NEXT:    movl %eax, 4(%edx)
-; X86-SSE-NEXT:    movl %edi, 8(%edx)
-; X86-SSE-NEXT:    movl %ebp, 12(%edx)
-; X86-SSE-NEXT:    movl %ecx, 16(%edx)
-; X86-SSE-NEXT:    movl %esi, 20(%edx)
-; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SSE-NEXT:    movl %eax, 24(%edx)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE-NEXT:    movl %eax, 28(%edx)
-; X86-SSE-NEXT:    movl %edx, %eax
+; X86-SSE-NEXT:    movl %ecx, 8(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %ebx, %ecx
+; X86-SSE-NEXT:    movl %ecx, 12(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %edi, %ecx
+; X86-SSE-NEXT:    movl %ecx, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, 24(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, 28(%eax)
 ; X86-SSE-NEXT:    jmp .LBB12_3
 ; X86-SSE-NEXT:  .LBB12_2: # %identity
-; X86-SSE-NEXT:    movl %ebx, (%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl %ebx, 4(%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl %ebx, 8(%eax)
-; X86-SSE-NEXT:    movl %edi, 12(%eax)
-; X86-SSE-NEXT:    movl %ebp, 16(%eax)
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl %ebp, 8(%eax)
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    movl %edi, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE-NEXT:    movl %edx, 24(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, 24(%eax)
 ; X86-SSE-NEXT:    movl %esi, 28(%eax)
 ; X86-SSE-NEXT:  .LBB12_3: # %identity
-; X86-SSE-NEXT:    addl $8, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
@@ -1494,60 +1458,53 @@ define <4 x i64> @and_sink_not_v4i64(<4 x i64> %x, <4 x i64> %m, i1 zeroext %con
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $8, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    je .LBB12_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ebp, %esi, %esi
-; X86-BMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ebx, %esi, %esi
-; X86-BMI-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %edi, %esi, %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %esi
-; X86-BMI-NEXT:    movl %ecx, %ebx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-BMI-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI-NEXT:    movl %edx, (%eax)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    andnl %ebx, %edx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ebp, %ebp
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ebx, %ebx
-; X86-BMI-NEXT:    movl %ebx, (%eax)
-; X86-BMI-NEXT:    movl %ebp, 4(%eax)
-; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %ebp, %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 8(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %ebx, %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %edi, 20(%eax)
-; X86-BMI-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %edi, %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 16(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 24(%eax)
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 28(%eax)
 ; X86-BMI-NEXT:    jmp .LBB12_3
 ; X86-BMI-NEXT:  .LBB12_2: # %identity
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl %edx, (%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl %ecx, 8(%eax)
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl %ebp, 8(%eax)
+; X86-BMI-NEXT:    movl %ebx, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, 16(%eax)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl %ecx, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %edi, 20(%eax)
-; X86-BMI-NEXT:    movl %ebx, 24(%eax)
-; X86-BMI-NEXT:    movl %ebp, 28(%eax)
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    movl %esi, 28(%eax)
 ; X86-BMI-NEXT:  .LBB12_3: # %identity
-; X86-BMI-NEXT:    addl $8, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -1592,75 +1549,64 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    je .LBB13_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl %esi, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    movl %eax, (%ebx)
-; X86-NEXT:    movl %edx, 4(%ebx)
-; X86-NEXT:    movl %ecx, 8(%ebx)
-; X86-NEXT:    movl %ebp, 12(%ebx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 16(%ebx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 20(%ebx)
-; X86-NEXT:    movl %edi, 24(%ebx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 28(%ebx)
-; X86-NEXT:    jmp .LBB13_3
-; X86-NEXT:  .LBB13_2: # %identity
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %ecx, %ebp
+; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %ecx, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, (%ebx)
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, 4(%ebx)
-; X86-NEXT:    movl %ecx, 8(%ebx)
-; X86-NEXT:    movl %ebp, 12(%ebx)
-; X86-NEXT:    movl %eax, 16(%ebx)
-; X86-NEXT:    movl %edi, 20(%ebx)
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, 28(%eax)
+; X86-NEXT:    jmp .LBB13_3
+; X86-NEXT:  .LBB13_2: # %identity
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 24(%ebx)
-; X86-NEXT:    movl %esi, 28(%ebx)
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl %esi, 28(%eax)
 ; X86-NEXT:  .LBB13_3: # %identity
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1673,75 +1619,64 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $12, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    je .LBB13_2
-; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movl %esi, %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %edx
-; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    movl %edi, %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    andl %esi, %edi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %edx
-; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %eax
-; X86-SSE-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %ebp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    andl %esi, %edx
+; X86-SSE-NEXT:    je .LBB13_2
+; X86-SSE-NEXT:  # %bb.1: # %mask
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, %eax
-; X86-SSE-NEXT:    movl %eax, (%ebx)
-; X86-SSE-NEXT:    movl %edx, 4(%ebx)
-; X86-SSE-NEXT:    movl %ecx, 8(%ebx)
-; X86-SSE-NEXT:    movl %ebp, 12(%ebx)
-; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SSE-NEXT:    movl %eax, 16(%ebx)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE-NEXT:    movl %eax, 20(%ebx)
-; X86-SSE-NEXT:    movl %edi, 24(%ebx)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE-NEXT:    movl %eax, 28(%ebx)
-; X86-SSE-NEXT:    jmp .LBB13_3
-; X86-SSE-NEXT:  .LBB13_2: # %identity
+; X86-SSE-NEXT:    andl %esi, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    notl %edx
+; X86-SSE-NEXT:    andl %edx, %ecx
+; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %ecx, %ebp
+; X86-SSE-NEXT:    movl %ebp, 8(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %ecx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl %edx, (%ebx)
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, 24(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl %edx, 4(%ebx)
-; X86-SSE-NEXT:    movl %ecx, 8(%ebx)
-; X86-SSE-NEXT:    movl %ebp, 12(%ebx)
-; X86-SSE-NEXT:    movl %eax, 16(%ebx)
-; X86-SSE-NEXT:    movl %edi, 20(%ebx)
+; X86-SSE-NEXT:    andl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, 28(%eax)
+; X86-SSE-NEXT:    jmp .LBB13_3
+; X86-SSE-NEXT:  .LBB13_2: # %identity
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl %ebp, 8(%eax)
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    movl %edi, 16(%eax)
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, 24(%ebx)
-; X86-SSE-NEXT:    movl %esi, 28(%ebx)
+; X86-SSE-NEXT:    movl %ecx, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, 24(%eax)
+; X86-SSE-NEXT:    movl %esi, 28(%eax)
 ; X86-SSE-NEXT:  .LBB13_3: # %identity
-; X86-SSE-NEXT:    movl %ebx, %eax
-; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
@@ -1773,60 +1708,53 @@ define <4 x i64> @and_sink_not_v4i64_swapped(<4 x i64> %x, <4 x i64> %m, i1 zero
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $8, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    je .LBB13_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ebp, %esi, %esi
-; X86-BMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ebx, %esi, %esi
-; X86-BMI-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %edi, %esi, %edi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %esi
-; X86-BMI-NEXT:    movl %ecx, %ebx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-BMI-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI-NEXT:    movl %edx, (%eax)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    andnl %ebx, %edx, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ebp, %ebp
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ebx, %ebx
-; X86-BMI-NEXT:    movl %ebx, (%eax)
-; X86-BMI-NEXT:    movl %ebp, 4(%eax)
-; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %ebp, %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 8(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %ebx, %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %edi, 20(%eax)
-; X86-BMI-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl %edi, %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 16(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 24(%eax)
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %ecx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 28(%eax)
 ; X86-BMI-NEXT:    jmp .LBB13_3
 ; X86-BMI-NEXT:  .LBB13_2: # %identity
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl %edx, (%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl %ecx, 8(%eax)
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl %ebp, 8(%eax)
+; X86-BMI-NEXT:    movl %ebx, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, 16(%eax)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl %ecx, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %edi, 20(%eax)
-; X86-BMI-NEXT:    movl %ebx, 24(%eax)
-; X86-BMI-NEXT:    movl %ebp, 28(%eax)
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    movl %esi, 28(%eax)
 ; X86-BMI-NEXT:  .LBB13_3: # %identity
-; X86-BMI-NEXT:    addl $8, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -1868,112 +1796,96 @@ define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) n
 ; X86-LABEL: and_sink_not_splat_v8i8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB14_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movb %dl, %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    notb %dl
-; X86-NEXT:    andb %dl, %ch
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    andb %dl, %ch
-; X86-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    andb %dl, %ch
-; X86-NEXT:    andb %dl, %dh
-; X86-NEXT:    andb %dl, %bl
-; X86-NEXT:    andb %dl, %bh
-; X86-NEXT:    andb %dl, %cl
-; X86-NEXT:    andb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    movb %dl, (%eax)
-; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    notb %bh
+; X86-NEXT:    andb %bh, %ch
+; X86-NEXT:    movb %ch, (%eax)
+; X86-NEXT:    andb %bh, %dl
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    andb %bh, %cl
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    andb %bh, %bl
 ; X86-NEXT:    movb %bl, 3(%eax)
+; X86-NEXT:    andb %bh, %dh
 ; X86-NEXT:    movb %dh, 4(%eax)
-; X86-NEXT:    movb %ch, 5(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb %bh, %cl
+; X86-NEXT:    movb %cl, 5(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb %bh, %cl
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 7(%eax)
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %bh
 ; X86-NEXT:    jmp .LBB14_3
 ; X86-NEXT:  .LBB14_2: # %identity
 ; X86-NEXT:    movb %ch, (%eax)
-; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    movb %cl, 2(%eax)
 ; X86-NEXT:    movb %bl, 3(%eax)
 ; X86-NEXT:    movb %dh, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movb %dl, 7(%eax)
 ; X86-NEXT:  .LBB14_3: # %identity
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movb %bh, 7(%eax)
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X86-SSE-LABEL: and_sink_not_splat_v8i8:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx
-; X86-SSE-NEXT:    pushl %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    je .LBB14_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movb %dl, %ch
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-SSE-NEXT:    notb %dl
-; X86-SSE-NEXT:    andb %dl, %ch
-; X86-SSE-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    andb %dl, %ch
-; X86-SSE-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    andb %dl, %ch
-; X86-SSE-NEXT:    andb %dl, %dh
-; X86-SSE-NEXT:    andb %dl, %bl
-; X86-SSE-NEXT:    andb %dl, %bh
-; X86-SSE-NEXT:    andb %dl, %cl
-; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %dl
-; X86-SSE-NEXT:    movb %dl, (%eax)
-; X86-SSE-NEXT:    movb %cl, 1(%eax)
-; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT:    notb %bh
+; X86-SSE-NEXT:    andb %bh, %ch
+; X86-SSE-NEXT:    movb %ch, (%eax)
+; X86-SSE-NEXT:    andb %bh, %dl
+; X86-SSE-NEXT:    movb %dl, 1(%eax)
+; X86-SSE-NEXT:    andb %bh, %cl
+; X86-SSE-NEXT:    movb %cl, 2(%eax)
+; X86-SSE-NEXT:    andb %bh, %bl
 ; X86-SSE-NEXT:    movb %bl, 3(%eax)
+; X86-SSE-NEXT:    andb %bh, %dh
 ; X86-SSE-NEXT:    movb %dh, 4(%eax)
-; X86-SSE-NEXT:    movb %ch, 5(%eax)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andb %bh, %cl
+; X86-SSE-NEXT:    movb %cl, 5(%eax)
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andb %bh, %cl
 ; X86-SSE-NEXT:    movb %cl, 6(%eax)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE-NEXT:    movb %cl, 7(%eax)
+; X86-SSE-NEXT:    andb {{[0-9]+}}(%esp), %bh
 ; X86-SSE-NEXT:    jmp .LBB14_3
 ; X86-SSE-NEXT:  .LBB14_2: # %identity
 ; X86-SSE-NEXT:    movb %ch, (%eax)
-; X86-SSE-NEXT:    movb %cl, 1(%eax)
-; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb %dl, 1(%eax)
+; X86-SSE-NEXT:    movb %cl, 2(%eax)
 ; X86-SSE-NEXT:    movb %bl, 3(%eax)
 ; X86-SSE-NEXT:    movb %dh, 4(%eax)
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movb %cl, 5(%eax)
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movb %cl, 6(%eax)
-; X86-SSE-NEXT:    movb %dl, 7(%eax)
 ; X86-SSE-NEXT:  .LBB14_3: # %identity
-; X86-SSE-NEXT:    addl $4, %esp
+; X86-SSE-NEXT:    movb %bh, 7(%eax)
 ; X86-SSE-NEXT:    popl %ebx
 ; X86-SSE-NEXT:    retl $4
 ;
@@ -1996,56 +1908,48 @@ define <8 x i8> @and_sink_not_splat_v8i8(<8 x i8> %x, i8 %m, i1 zeroext %cond) n
 ; X86-BMI-LABEL: and_sink_not_splat_v8i8:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %eax
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB14_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
-; X86-BMI-NEXT:    movb %dl, %ch
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI-NEXT:    notb %dl
-; X86-BMI-NEXT:    andb %dl, %ch
-; X86-BMI-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    andb %dl, %ch
-; X86-BMI-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    andb %dl, %ch
-; X86-BMI-NEXT:    andb %dl, %dh
-; X86-BMI-NEXT:    andb %dl, %bl
-; X86-BMI-NEXT:    andb %dl, %bh
-; X86-BMI-NEXT:    andb %dl, %cl
-; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %dl
-; X86-BMI-NEXT:    movb %dl, 1(%eax)
-; X86-BMI-NEXT:    movb %cl, (%eax)
-; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT:    notb %bh
+; X86-BMI-NEXT:    andb %bh, %ch
+; X86-BMI-NEXT:    movb %ch, 1(%eax)
+; X86-BMI-NEXT:    andb %bh, %dl
+; X86-BMI-NEXT:    movb %dl, (%eax)
+; X86-BMI-NEXT:    andb %bh, %cl
+; X86-BMI-NEXT:    movb %cl, 2(%eax)
+; X86-BMI-NEXT:    andb %bh, %bl
 ; X86-BMI-NEXT:    movb %bl, 3(%eax)
+; X86-BMI-NEXT:    andb %bh, %dh
 ; X86-BMI-NEXT:    movb %dh, 4(%eax)
-; X86-BMI-NEXT:    movb %ch, 5(%eax)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andb %bh, %cl
+; X86-BMI-NEXT:    movb %cl, 5(%eax)
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andb %bh, %cl
 ; X86-BMI-NEXT:    movb %cl, 6(%eax)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-BMI-NEXT:    movb %cl, 7(%eax)
+; X86-BMI-NEXT:    andb {{[0-9]+}}(%esp), %bh
 ; X86-BMI-NEXT:    jmp .LBB14_3
 ; X86-BMI-NEXT:  .LBB14_2: # %identity
-; X86-BMI-NEXT:    movb %cl, (%eax)
+; X86-BMI-NEXT:    movb %dl, (%eax)
 ; X86-BMI-NEXT:    movb %ch, 1(%eax)
-; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movb %cl, 2(%eax)
 ; X86-BMI-NEXT:    movb %bl, 3(%eax)
 ; X86-BMI-NEXT:    movb %dh, 4(%eax)
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movb %cl, 5(%eax)
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movb %cl, 6(%eax)
-; X86-BMI-NEXT:    movb %dl, 7(%eax)
 ; X86-BMI-NEXT:  .LBB14_3: # %identity
-; X86-BMI-NEXT:    addl $4, %esp
+; X86-BMI-NEXT:    movb %bh, 7(%eax)
 ; X86-BMI-NEXT:    popl %ebx
 ; X86-BMI-NEXT:    retl $4
 ;
@@ -2092,116 +1996,100 @@ define <8 x i8> @and_sink_not_splat_v8i8_swapped(<8 x i8> %x, i8 %m, i1 zeroext
 ; X86-LABEL: and_sink_not_splat_v8i8_swapped:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB15_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    notb %ch
-; X86-NEXT:    andb %ch, %dh
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    andb %ch, %dh
-; X86-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    andb %ch, %dh
-; X86-NEXT:    andb %ch, %bl
-; X86-NEXT:    andb %ch, %bh
-; X86-NEXT:    andb %ch, %cl
-; X86-NEXT:    andb %ch, %dl
-; X86-NEXT:    movb %dl, (%eax)
-; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-NEXT:    notb %bh
+; X86-NEXT:    andb %bh, %ch
+; X86-NEXT:    movb %ch, (%eax)
+; X86-NEXT:    andb %bh, %dl
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    andb %bh, %cl
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    andb %bh, %bl
 ; X86-NEXT:    movb %bl, 3(%eax)
+; X86-NEXT:    andb %bh, %dh
 ; X86-NEXT:    movb %dh, 4(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb %bh, %cl
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb %bh, %cl
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb %bh, %cl
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    jmp .LBB15_3
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB15_2: # %identity
-; X86-NEXT:    movb %dl, (%eax)
-; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movb %bh, 2(%eax)
+; X86-NEXT:    movb %ch, (%eax)
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    movb %cl, 2(%eax)
 ; X86-NEXT:    movb %bl, 3(%eax)
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb %cl, 4(%eax)
+; X86-NEXT:    movb %dh, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movb %dh, 7(%eax)
-; X86-NEXT:  .LBB15_3: # %identity
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movb %bh, 7(%eax)
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X86-SSE-LABEL: and_sink_not_splat_v8i8_swapped:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %ebx
-; X86-SSE-NEXT:    pushl %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    je .LBB15_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-SSE-NEXT:    notb %ch
-; X86-SSE-NEXT:    andb %ch, %dh
-; X86-SSE-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    andb %ch, %dh
-; X86-SSE-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE-NEXT:    andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-SSE-NEXT:    andb %ch, %dh
-; X86-SSE-NEXT:    andb %ch, %bl
-; X86-SSE-NEXT:    andb %ch, %bh
-; X86-SSE-NEXT:    andb %ch, %cl
-; X86-SSE-NEXT:    andb %ch, %dl
-; X86-SSE-NEXT:    movb %dl, (%eax)
-; X86-SSE-NEXT:    movb %cl, 1(%eax)
-; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-SSE-NEXT:    notb %bh
+; X86-SSE-NEXT:    andb %bh, %ch
+; X86-SSE-NEXT:    movb %ch, (%eax)
+; X86-SSE-NEXT:    andb %bh, %dl
+; X86-SSE-NEXT:    movb %dl, 1(%eax)
+; X86-SSE-NEXT:    andb %bh, %cl
+; X86-SSE-NEXT:    movb %cl, 2(%eax)
+; X86-SSE-NEXT:    andb %bh, %bl
 ; X86-SSE-NEXT:    movb %bl, 3(%eax)
+; X86-SSE-NEXT:    andb %bh, %dh
 ; X86-SSE-NEXT:    movb %dh, 4(%eax)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andb %bh, %cl
 ; X86-SSE-NEXT:    movb %cl, 5(%eax)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andb %bh, %cl
 ; X86-SSE-NEXT:    movb %cl, 6(%eax)
-; X86-SSE-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andb %bh, %cl
 ; X86-SSE-NEXT:    movb %cl, 7(%eax)
-; X86-SSE-NEXT:    jmp .LBB15_3
+; X86-SSE-NEXT:    popl %ebx
+; X86-SSE-NEXT:    retl $4
 ; X86-SSE-NEXT:  .LBB15_2: # %identity
-; X86-SSE-NEXT:    movb %dl, (%eax)
-; X86-SSE-NEXT:    movb %cl, 1(%eax)
-; X86-SSE-NEXT:    movb %bh, 2(%eax)
+; X86-SSE-NEXT:    movb %ch, (%eax)
+; X86-SSE-NEXT:    movb %dl, 1(%eax)
+; X86-SSE-NEXT:    movb %cl, 2(%eax)
 ; X86-SSE-NEXT:    movb %bl, 3(%eax)
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movb %cl, 4(%eax)
+; X86-SSE-NEXT:    movb %dh, 4(%eax)
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movb %cl, 5(%eax)
 ; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movb %cl, 6(%eax)
-; X86-SSE-NEXT:    movb %dh, 7(%eax)
-; X86-SSE-NEXT:  .LBB15_3: # %identity
-; X86-SSE-NEXT:    addl $4, %esp
+; X86-SSE-NEXT:    movb %bh, 7(%eax)
 ; X86-SSE-NEXT:    popl %ebx
 ; X86-SSE-NEXT:    retl $4
 ;
@@ -2223,58 +2111,50 @@ define <8 x i8> @and_sink_not_splat_v8i8_swapped(<8 x i8> %x, i8 %m, i1 zeroext
 ; X86-BMI-LABEL: and_sink_not_splat_v8i8_swapped:
 ; X86-BMI:       # %bb.0:
 ; X86-BMI-NEXT:    pushl %ebx
-; X86-BMI-NEXT:    pushl %eax
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-BMI-NEXT:    je .LBB15_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-BMI-NEXT:    notb %ch
-; X86-BMI-NEXT:    andb %ch, %dh
-; X86-BMI-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    andb %ch, %dh
-; X86-BMI-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-BMI-NEXT:    andb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-BMI-NEXT:    andb %ch, %dh
-; X86-BMI-NEXT:    andb %ch, %bl
-; X86-BMI-NEXT:    andb %ch, %bh
-; X86-BMI-NEXT:    andb %ch, %dl
-; X86-BMI-NEXT:    andb %ch, %cl
-; X86-BMI-NEXT:    movb %cl, 1(%eax)
+; X86-BMI-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; X86-BMI-NEXT:    notb %bh
+; X86-BMI-NEXT:    andb %bh, %ch
+; X86-BMI-NEXT:    movb %ch, 1(%eax)
+; X86-BMI-NEXT:    andb %bh, %dl
 ; X86-BMI-NEXT:    movb %dl, (%eax)
-; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    andb %bh, %cl
+; X86-BMI-NEXT:    movb %cl, 2(%eax)
+; X86-BMI-NEXT:    andb %bh, %bl
 ; X86-BMI-NEXT:    movb %bl, 3(%eax)
+; X86-BMI-NEXT:    andb %bh, %dh
 ; X86-BMI-NEXT:    movb %dh, 4(%eax)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andb %bh, %cl
 ; X86-BMI-NEXT:    movb %cl, 5(%eax)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andb %bh, %cl
 ; X86-BMI-NEXT:    movb %cl, 6(%eax)
-; X86-BMI-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    andb %bh, %cl
 ; X86-BMI-NEXT:    movb %cl, 7(%eax)
-; X86-BMI-NEXT:    jmp .LBB15_3
+; X86-BMI-NEXT:    popl %ebx
+; X86-BMI-NEXT:    retl $4
 ; X86-BMI-NEXT:  .LBB15_2: # %identity
 ; X86-BMI-NEXT:    movb %dl, (%eax)
-; X86-BMI-NEXT:    movb %cl, 1(%eax)
-; X86-BMI-NEXT:    movb %bh, 2(%eax)
+; X86-BMI-NEXT:    movb %ch, 1(%eax)
+; X86-BMI-NEXT:    movb %cl, 2(%eax)
 ; X86-BMI-NEXT:    movb %bl, 3(%eax)
-; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movb %cl, 4(%eax)
+; X86-BMI-NEXT:    movb %dh, 4(%eax)
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movb %cl, 5(%eax)
 ; X86-BMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movb %cl, 6(%eax)
-; X86-BMI-NEXT:    movb %dh, 7(%eax)
-; X86-BMI-NEXT:  .LBB15_3: # %identity
-; X86-BMI-NEXT:    addl $4, %esp
+; X86-BMI-NEXT:    movb %bh, 7(%eax)
 ; X86-BMI-NEXT:    popl %ebx
 ; X86-BMI-NEXT:    retl $4
 ;
@@ -2332,18 +2212,21 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
 ; X86-NEXT:  # %bb.1: # %mask
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %ebx, %ecx
-; X86-NEXT:    andl %ebx, %edx
+; X86-NEXT:    andl %ebx, %edi
+; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    andl %ebx, %esi
-; X86-NEXT:    andl %edi, %ebx
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    andl %ebx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    andl %ecx, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    jmp .LBB16_3
 ; X86-NEXT:  .LBB16_2: # %identity
 ; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:  .LBB16_3: # %identity
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:  .LBB16_3: # %identity
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -2428,14 +2311,19 @@ define <4 x i32> @and_sink_not_splat_v4i32(<4 x i32> %x, i32 %m, i1 zeroext %con
 ; X86-BMI-NEXT:    je .LBB16_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
-; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
-; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
 ; X86-BMI-NEXT:    andnl %edi, %ebx, %edi
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT:    jmp .LBB16_3
 ; X86-BMI-NEXT:  .LBB16_2: # %identity
 ; X86-BMI-NEXT:    movl %edi, (%eax)
 ; X86-BMI-NEXT:    movl %esi, 4(%eax)
 ; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:  .LBB16_3: # %identity
 ; X86-BMI-NEXT:    movl %ecx, 12(%eax)
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
@@ -2493,14 +2381,19 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
 ; X86-NEXT:  # %bb.1: # %mask
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl %ebx, %ecx
-; X86-NEXT:    andl %ebx, %edx
-; X86-NEXT:    andl %ebx, %esi
 ; X86-NEXT:    andl %ebx, %edi
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    andl %ebx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    andl %ebx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    andl %ebx, %ecx
+; X86-NEXT:    jmp .LBB17_3
 ; X86-NEXT:  .LBB17_2: # %identity
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:  .LBB17_3: # %identity
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2586,14 +2479,19 @@ define <4 x i32> @and_sink_not_splat_v4i32_swapped(<4 x i32> %x, i32 %m, i1 zero
 ; X86-BMI-NEXT:    je .LBB17_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
-; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
-; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
 ; X86-BMI-NEXT:    andnl %edi, %ebx, %edi
+; X86-BMI-NEXT:    movl %edi, (%eax)
+; X86-BMI-NEXT:    andnl %esi, %ebx, %esi
+; X86-BMI-NEXT:    movl %esi, 4(%eax)
+; X86-BMI-NEXT:    andnl %edx, %ebx, %edx
+; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:    andnl %ecx, %ebx, %ecx
+; X86-BMI-NEXT:    jmp .LBB17_3
 ; X86-BMI-NEXT:  .LBB17_2: # %identity
 ; X86-BMI-NEXT:    movl %edi, (%eax)
 ; X86-BMI-NEXT:    movl %esi, 4(%eax)
 ; X86-BMI-NEXT:    movl %edx, 8(%eax)
+; X86-BMI-NEXT:  .LBB17_3: # %identity
 ; X86-BMI-NEXT:    movl %ecx, 12(%eax)
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
@@ -2642,56 +2540,50 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    je .LBB18_2
 ; X86-NEXT:  # %bb.1: # %mask
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    andl %ecx, %ebx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    andl %edx, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    andl %edx, %edi
-; X86-NEXT:    andl %edx, %ebp
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    andl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    andl %edx, %ebx
 ; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    andl %ecx, %edi
 ; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %edx, %esi
 ; X86-NEXT:    movl %esi, 20(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    jmp .LBB18_3
 ; X86-NEXT:  .LBB18_2: # %identity
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %ebp, 8(%eax)
 ; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    movl %edi, 16(%eax)
-; X86-NEXT:    movl %esi, 20(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl %edx, 28(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:  .LBB18_3: # %identity
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -2704,56 +2596,50 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $8, %esp
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    je .LBB18_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    notl %ecx
 ; X86-SSE-NEXT:    andl %ecx, %edx
-; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE-NEXT:    andl %ecx, %esi
-; X86-SSE-NEXT:    andl %ecx, %ebx
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %edx, (%eax)
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    notl %edx
-; X86-SSE-NEXT:    andl %edx, (%esp) # 4-byte Folded Spill
-; X86-SSE-NEXT:    andl %edx, %edi
-; X86-SSE-NEXT:    andl %edx, %ebp
-; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl %edx, (%eax)
-; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    andl %edx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    andl %ecx, %ebp
 ; X86-SSE-NEXT:    movl %ebp, 8(%eax)
+; X86-SSE-NEXT:    andl %edx, %ebx
 ; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    andl %ecx, %edi
 ; X86-SSE-NEXT:    movl %edi, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    andl %edx, %esi
 ; X86-SSE-NEXT:    movl %esi, 20(%eax)
-; X86-SSE-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE-NEXT:    movl %ecx, 28(%eax)
+; X86-SSE-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl %edx, 28(%eax)
 ; X86-SSE-NEXT:    jmp .LBB18_3
 ; X86-SSE-NEXT:  .LBB18_2: # %identity
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, (%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
 ; X86-SSE-NEXT:    movl %ebp, 8(%eax)
 ; X86-SSE-NEXT:    movl %ebx, 12(%eax)
 ; X86-SSE-NEXT:    movl %edi, 16(%eax)
-; X86-SSE-NEXT:    movl %esi, 20(%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE-NEXT:    movl %edx, 28(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl %edx, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl %edx, 24(%eax)
+; X86-SSE-NEXT:    movl %ecx, 28(%eax)
 ; X86-SSE-NEXT:  .LBB18_3: # %identity
-; X86-SSE-NEXT:    addl $8, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
@@ -2783,52 +2669,47 @@ define <4 x i64> @and_sink_not_splat_v4i64(<4 x i64> %x, i64 %m, i1 zeroext %con
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $8, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    je .LBB18_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ecx, %esi, %ecx
-; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    andnl %ebx, %esi, %ecx
-; X86-BMI-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    andnl %ebp, %esi, %ebp
 ; X86-BMI-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ebx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %edi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ecx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %esi
-; X86-BMI-NEXT:    movl %esi, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
+; X86-BMI-NEXT:    movl %edx, (%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    andnl %ebp, %esi, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 8(%eax)
-; X86-BMI-NEXT:    movl %ebp, 12(%eax)
-; X86-BMI-NEXT:    movl %edi, 16(%eax)
-; X86-BMI-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    andnl %ebx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 12(%eax)
+; X86-BMI-NEXT:    andnl %edi, %esi, %ecx
+; X86-BMI-NEXT:    movl %ecx, 16(%eax)
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 20(%eax)
-; X86-BMI-NEXT:    movl %ebx, 24(%eax)
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 28(%eax)
 ; X86-BMI-NEXT:    jmp .LBB18_3
 ; X86-BMI-NEXT:  .LBB18_2: # %identity
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl %edi, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 8(%eax)
-; X86-BMI-NEXT:    movl %ebp, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %ebx, 20(%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 24(%eax)
+; X86-BMI-NEXT:    movl %edx, (%eax)
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl %ebp, 8(%eax)
+; X86-BMI-NEXT:    movl %ebx, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, 16(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    movl %esi, 28(%eax)
 ; X86-BMI-NEXT:  .LBB18_3: # %identity
-; X86-BMI-NEXT:    movl %ecx, 28(%eax)
-; X86-BMI-NEXT:    addl $8, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
@@ -2878,59 +2759,51 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    je .LBB19_2
 ; X86-NEXT:  # %bb.1: # %mask
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl %esi, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    andl %esi, %ebx
-; X86-NEXT:    andl %esi, %ebp
-; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl %ecx, %edx
 ; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    andl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl %edi, 24(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    andl %edx, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %edx, %esi
+; X86-NEXT:    movl %esi, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    jmp .LBB19_3
 ; X86-NEXT:  .LBB19_2: # %identity
 ; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %ebp, 8(%eax)
 ; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    movl %edi, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 24(%eax)
 ; X86-NEXT:  .LBB19_3: # %identity
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -2943,59 +2816,51 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $12, %esp
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-SSE-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    je .LBB19_2
 ; X86-SSE-NEXT:  # %bb.1: # %mask
-; X86-SSE-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    andl %esi, (%esp) # 4-byte Folded Spill
-; X86-SSE-NEXT:    andl %esi, %edi
-; X86-SSE-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE-NEXT:    andl %esi, %ebx
-; X86-SSE-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE-NEXT:    andl %esi, %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    notl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    andl %esi, %edi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    andl %esi, %ebx
-; X86-SSE-NEXT:    andl %esi, %ebp
-; X86-SSE-NEXT:    andl %esi, %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    notl %ecx
+; X86-SSE-NEXT:    andl %ecx, %edx
 ; X86-SSE-NEXT:    movl %edx, (%eax)
-; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    notl %edx
+; X86-SSE-NEXT:    andl %edx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    andl %ecx, %ebp
 ; X86-SSE-NEXT:    movl %ebp, 8(%eax)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE-NEXT:    movl %ebx, 16(%eax)
-; X86-SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE-NEXT:    movl %edi, 24(%eax)
-; X86-SSE-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SSE-NEXT:    movl %ecx, 28(%eax)
+; X86-SSE-NEXT:    andl %edx, %ebx
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    andl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    andl %edx, %esi
+; X86-SSE-NEXT:    movl %esi, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    andl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, 24(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    andl %edx, %ecx
 ; X86-SSE-NEXT:    jmp .LBB19_3
 ; X86-SSE-NEXT:  .LBB19_2: # %identity
 ; X86-SSE-NEXT:    movl %edx, (%eax)
-; X86-SSE-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
 ; X86-SSE-NEXT:    movl %ebp, 8(%eax)
 ; X86-SSE-NEXT:    movl %ebx, 12(%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE-NEXT:    movl %edi, 20(%eax)
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE-NEXT:    movl %esi, 28(%eax)
+; X86-SSE-NEXT:    movl %edi, 16(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl %edx, 20(%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl %edx, 24(%eax)
 ; X86-SSE-NEXT:  .LBB19_3: # %identity
-; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    movl %ecx, 28(%eax)
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
@@ -3025,52 +2890,47 @@ define <4 x i64> @and_sink_not_splat_v4i64_swapped(<4 x i64> %x, i64 %m, i1 zero
 ; X86-BMI-NEXT:    pushl %ebx
 ; X86-BMI-NEXT:    pushl %edi
 ; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    subl $8, %esp
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-BMI-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    je .LBB19_2
 ; X86-BMI-NEXT:  # %bb.1: # %mask
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl %ecx, %esi, %ecx
-; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    andnl %ebx, %esi, %ecx
-; X86-BMI-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-BMI-NEXT:    andnl %ebp, %esi, %ebp
 ; X86-BMI-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ebx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %edi
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ecx
-; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %esi
-; X86-BMI-NEXT:    movl %esi, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
+; X86-BMI-NEXT:    movl %edx, (%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    andnl %ecx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    andnl %ebp, %esi, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 8(%eax)
-; X86-BMI-NEXT:    movl %ebp, 12(%eax)
-; X86-BMI-NEXT:    movl %edi, 16(%eax)
-; X86-BMI-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    andnl %ebx, %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 12(%eax)
+; X86-BMI-NEXT:    andnl %edi, %esi, %ecx
+; X86-BMI-NEXT:    movl %ecx, 16(%eax)
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %ecx
 ; X86-BMI-NEXT:    movl %ecx, 20(%eax)
-; X86-BMI-NEXT:    movl %ebx, 24(%eax)
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %esi, %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    andnl {{[0-9]+}}(%esp), %edx, %ecx
+; X86-BMI-NEXT:    movl %ecx, 28(%eax)
 ; X86-BMI-NEXT:    jmp .LBB19_3
 ; X86-BMI-NEXT:  .LBB19_2: # %identity
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    movl %edi, (%eax)
-; X86-BMI-NEXT:    movl %edx, 4(%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 8(%eax)
-; X86-BMI-NEXT:    movl %ebp, 12(%eax)
-; X86-BMI-NEXT:    movl %esi, 16(%eax)
-; X86-BMI-NEXT:    movl %ebx, 20(%eax)
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %edx, 24(%eax)
+; X86-BMI-NEXT:    movl %edx, (%eax)
+; X86-BMI-NEXT:    movl %ecx, 4(%eax)
+; X86-BMI-NEXT:    movl %ebp, 8(%eax)
+; X86-BMI-NEXT:    movl %ebx, 12(%eax)
+; X86-BMI-NEXT:    movl %edi, 16(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 20(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, 24(%eax)
+; X86-BMI-NEXT:    movl %esi, 28(%eax)
 ; X86-BMI-NEXT:  .LBB19_3: # %identity
-; X86-BMI-NEXT:    movl %ecx, 28(%eax)
-; X86-BMI-NEXT:    addl $8, %esp
 ; X86-BMI-NEXT:    popl %esi
 ; X86-BMI-NEXT:    popl %edi
 ; X86-BMI-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/apx/i386-ndd.ll b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
index 146a99340eb66..900a7355115a4 100644
--- a/llvm/test/CodeGen/X86/apx/i386-ndd.ll
+++ b/llvm/test/CodeGen/X86/apx/i386-ndd.ll
@@ -3,9 +3,9 @@
 define i32 @test(i1 %cmp, i32 %x, i32 %y) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    cmovnel %eax, %ecx
 ; CHECK-NEXT:    movl (%ecx), %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index f13627b55856f..d50541a958561 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -188,25 +188,25 @@ define void @split_i128(ptr %sret, i128 %x) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $48, %esp
-; CHECK-NEXT:    movl 24(%ebp), %eax
+; CHECK-NEXT:    movl 36(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 28(%ebp), %ebx
-; CHECK-NEXT:    movl 32(%ebp), %esi
-; CHECK-NEXT:    movl 36(%ebp), %edi
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 32(%ebp), %edi
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 24(%ebp), %ebx
+; CHECK-NEXT:    movl 28(%ebp), %esi
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll _addrof_i128
 ; CHECK-NEXT:    addl $4, %esp
 ; CHECK-NEXT:    movl 8(%ebp), %eax
-; CHECK-NEXT:    movl %edi, 12(%eax)
-; CHECK-NEXT:    movl %esi, 8(%eax)
-; CHECK-NEXT:    movl %ebx, 4(%eax)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl %edi, 8(%eax)
+; CHECK-NEXT:    movl %esi, 4(%eax)
+; CHECK-NEXT:    movl %ebx, (%eax)
 ; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
@@ -332,10 +332,10 @@ define void @escape_with_store(i32 %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %esp, %ecx
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, (%esp)
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll _addrof_i32
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence.ll b/llvm/test/CodeGen/X86/arithmetic_fence.ll
index e167601e0e6a2..9804cb1a7ee94 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence.ll
@@ -170,30 +170,20 @@ define <2 x float> @f6(<2 x float> %a) {
 define dso_local fp128 @foo(ptr nocapture readonly %qr) local_unnamed_addr{
 ; X86-LABEL: foo:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl 12(%ecx), %edx
-; X86-NEXT:    movl 8(%ecx), %esi
-; X86-NEXT:    movl (%ecx), %edi
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    #ARITH_FENCE
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl 8(%ecx), %edx
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
 ; X86-NEXT:    #ARITH_FENCE
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/arithmetic_fence2.ll b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
index 3c2ef21527f50..f807cac826626 100644
--- a/llvm/test/CodeGen/X86/arithmetic_fence2.ll
+++ b/llvm/test/CodeGen/X86/arithmetic_fence2.ll
@@ -130,12 +130,12 @@ define <8 x float> @f6(<8 x float> %a) {
 ; X86-LABEL: f6:
 ; X86:       # %bb.0:
 ; X86-NEXT:    addps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, %xmm2
+; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    addps %xmm2, %xmm0
 ; X86-NEXT:    addps %xmm1, %xmm1
 ; X86-NEXT:    movaps %xmm1, %xmm2
 ; X86-NEXT:    #ARITH_FENCE
-; X86-NEXT:    movaps %xmm0, %xmm3
-; X86-NEXT:    #ARITH_FENCE
-; X86-NEXT:    addps %xmm3, %xmm0
 ; X86-NEXT:    addps %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
@@ -194,10 +194,11 @@ define <2 x half> @f9(<2 x half> %a) nounwind {
 ; X86-LABEL: f9:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    psrld $16, %xmm1
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    psrld $16, %xmm0
 ; X86-NEXT:    #ARITH_FENCE
-; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: f9:
@@ -215,19 +216,20 @@ define <2 x half> @f9(<2 x half> %a) nounwind {
 define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
 ; X86-LABEL: f10:
 ; X86:       # %bb.0:
-; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    psrld $16, %xmm1
-; X86-NEXT:    pextrw $0, %xmm1, %ecx
-; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT:    pextrw $0, %xmm0, %edx
+; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm0
+; X86-NEXT:    movdqa %xmm1, %xmm2
+; X86-NEXT:    psrld $16, %xmm2
+; X86-NEXT:    pextrw $0, %xmm2, %eax
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm2
+; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-NEXT:    pextrw $0, %xmm1, %eax
 ; X86-NEXT:    #ARITH_FENCE
-; X86-NEXT:    pinsrw $0, %eax, %xmm0
-; X86-NEXT:    pinsrw $0, %ecx, %xmm1
-; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT:    pinsrw $0, %edx, %xmm1
+; X86-NEXT:    pinsrw $0, %eax, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-NEXT:    retl
 ;
@@ -255,28 +257,27 @@ define <3 x bfloat> @f10(<3 x bfloat> %a) nounwind {
 define <4 x bfloat> @f11(<4 x bfloat> %a) nounwind {
 ; X86-LABEL: f11:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlq $48, %xmm1
 ; X86-NEXT:    pextrw $0, %xmm1, %eax
-; X86-NEXT:    movdqa %xmm0, %xmm1
-; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; X86-NEXT:    pextrw $0, %xmm1, %edx
-; X86-NEXT:    pextrw $0, %xmm0, %ecx
-; X86-NEXT:    psrld $16, %xmm0
-; X86-NEXT:    pextrw $0, %xmm0, %esi
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm1
+; X86-NEXT:    movdqa %xmm0, %xmm2
+; X86-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; X86-NEXT:    pextrw $0, %xmm2, %eax
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm2
+; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    #ARITH_FENCE
+; X86-NEXT:    pinsrw $0, %eax, %xmm1
+; X86-NEXT:    psrld $16, %xmm0
+; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    #ARITH_FENCE
 ; X86-NEXT:    pinsrw $0, %eax, %xmm0
-; X86-NEXT:    pinsrw $0, %edx, %xmm1
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT:    pinsrw $0, %ecx, %xmm0
-; X86-NEXT:    pinsrw $0, %esi, %xmm2
-; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT:    popl %esi
+; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: f11:
diff --git a/llvm/test/CodeGen/X86/atom-fixup-lea3.ll b/llvm/test/CodeGen/X86/atom-fixup-lea3.ll
index d4c010e391024..54e249bf5d315 100644
--- a/llvm/test/CodeGen/X86/atom-fixup-lea3.ll
+++ b/llvm/test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
 ; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
 ; CHECK-NEXT: movl
@@ -23,6 +24,53 @@
 ;}
 
 define i32 @test(i32 %n, ptr nocapture %array, ptr nocapture %k, ptr nocapture %l, ptr nocapture %m, ptr nocapture %array2) #0 {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    .cfi_offset %esi, -20
+; CHECK-NEXT:    .cfi_offset %edi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    jle .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %for.body.lr.ph
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl (%esi), %ebx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_3: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %ebx, (%esi)
+; CHECK-NEXT:    addl (%edx,%ebp,4), %ebx
+; CHECK-NEXT:    movl %ebx, (%esi)
+; CHECK-NEXT:    addl 4(%edi,%ebp,4), %eax
+; CHECK-NEXT:    leal 1(%ebp), %ebp
+; CHECK-NEXT:    cmpl %ebp, %ecx
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:  .LBB0_4: # %for.end
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll
index b06bef44a5e9e..1381f53badfa4 100644
--- a/llvm/test/CodeGen/X86/atomic-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll
@@ -103,9 +103,9 @@ define i64 @bts63() nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    movl v64+4, %edx
 ; X86-NEXT:    movl v64, %eax
+; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -227,9 +227,9 @@ define i64 @btc63() nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    movl v64+4, %edx
 ; X86-NEXT:    movl v64, %eax
+; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -356,10 +356,10 @@ define i64 @btr63() nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    movl $-1, %edi
 ; X86-NEXT:    movl v64+4, %edx
 ; X86-NEXT:    movl v64, %eax
+; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT:    movl $-1, %edi
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB14_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -407,8 +407,8 @@ define i16 @multi_use1() nounwind {
 ; X86-NEXT:    jne .LBB15_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    xorl $2, %eax
+; X86-NEXT:    xorl $2, %ecx
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
@@ -572,12 +572,11 @@ if.end:                                           ; preds = %entry
 define i32 @split_hoist_and(i32 %0) nounwind {
 ; X86-LABEL: split_hoist_and:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl $3, v32
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: split_hoist_and:
diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll
index 2dee1d12e7255..a9489b1136b80 100644
--- a/llvm/test/CodeGen/X86/atomic-fp.ll
+++ b/llvm/test/CodeGen/X86/atomic-fp.ll
@@ -101,8 +101,8 @@ define dso_local void @fadd_64r(ptr %loc, double %val) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
-; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -648,9 +648,9 @@ define dso_local void @fadd_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -808,8 +808,8 @@ define dso_local void @fsub_64r(ptr %loc, double %val) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
-; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -1362,9 +1362,9 @@ define dso_local void @fsub_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -1522,8 +1522,8 @@ define dso_local void @fmul_64r(ptr %loc, double %val) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
-; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -2066,9 +2066,9 @@ define dso_local void @fmul_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -2226,8 +2226,8 @@ define dso_local void @fdiv_64r(ptr %loc, double %val) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
-; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -2774,9 +2774,9 @@ define dso_local void @fdiv_array(ptr %arg, double %arg1, i64 %arg2) nounwind {
 ; X86-SSE1-NEXT:    movl %esp, %ebp
 ; X86-SSE1-NEXT:    andl $-8, %esp
 ; X86-SSE1-NEXT:    subl $16, %esp
+; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE1-NEXT:    movl 8(%ebp), %ecx
-; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
index 9e20fdb59f552..507b2bf28ef19 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent-syncscope.ll
@@ -18,12 +18,12 @@ define i8 @add8(ptr %p) #0 {
 ; X64-NEXT:    movzbl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-GENERIC-LABEL: add8:
-; X86-GENERIC:       # %bb.0:
-; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-GENERIC-NEXT:    #MEMBARRIER
-; X86-GENERIC-NEXT:    movzbl (%eax), %eax
-; X86-GENERIC-NEXT:    retl
+; X86-SSE2-LABEL: add8:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    #MEMBARRIER
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
@@ -46,12 +46,12 @@ define i16 @or16(ptr %p) #0 {
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-GENERIC-LABEL: or16:
-; X86-GENERIC:       # %bb.0:
-; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-GENERIC-NEXT:    #MEMBARRIER
-; X86-GENERIC-NEXT:    movzwl (%eax), %eax
-; X86-GENERIC-NEXT:    retl
+; X86-SSE2-LABEL: or16:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    #MEMBARRIER
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzwl (%eax), %eax
+; X86-SSE2-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: or16:
 ; X86-ATOM:       # %bb.0:
@@ -74,12 +74,12 @@ define i32 @xor32(ptr %p) #0 {
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-GENERIC-LABEL: xor32:
-; X86-GENERIC:       # %bb.0:
-; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-GENERIC-NEXT:    #MEMBARRIER
-; X86-GENERIC-NEXT:    movl (%eax), %eax
-; X86-GENERIC-NEXT:    retl
+; X86-SSE2-LABEL: xor32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    #MEMBARRIER
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: xor32:
 ; X86-ATOM:       # %bb.0:
@@ -254,12 +254,12 @@ define i32 @and32 (ptr %p) #0 {
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
-; X86-GENERIC-LABEL: and32:
-; X86-GENERIC:       # %bb.0:
-; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-GENERIC-NEXT:    #MEMBARRIER
-; X86-GENERIC-NEXT:    movl (%eax), %eax
-; X86-GENERIC-NEXT:    retl
+; X86-SSE2-LABEL: and32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    #MEMBARRIER
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    retl
 ;
 ; X86-ATOM-LABEL: and32:
 ; X86-ATOM:       # %bb.0:
@@ -612,4 +612,3 @@ attributes #0 = { nounwind }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; X86-SLM: {{.*}}
-; X86-SSE2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 01c3e7999a92c..2fd2a50ca0db1 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -20,18 +20,11 @@ define i8 @add8(ptr %p) #0 {
 ;
 ; X86-SSE2-LABEL: add8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: add8:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzbl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: add8:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -53,18 +46,11 @@ define i16 @or16(ptr %p) #0 {
 ;
 ; X86-SSE2-LABEL: or16:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzwl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: or16:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzwl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: or16:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -86,18 +72,11 @@ define i32 @xor32(ptr %p) #0 {
 ;
 ; X86-SSE2-LABEL: xor32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: xor32:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: xor32:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -271,18 +250,11 @@ define i32 @and32 (ptr %p) #0 {
 ;
 ; X86-SSE2-LABEL: and32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: and32:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: and32:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -631,18 +603,11 @@ define void @atomic_umin_uint_max(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_umin_uint_max:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_umin_uint_max:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_umin_uint_max:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -664,18 +629,11 @@ define void @atomic_umax_zero(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_umax_zero:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_umax_zero:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_umax_zero:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -697,18 +655,11 @@ define void @atomic_min_smax_char(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_min_smax_char:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_min_smax_char:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzbl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_min_smax_char:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -730,18 +681,11 @@ define void @atomic_max_smin_char(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_max_smin_char:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_max_smin_char:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzbl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_max_smin_char:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -763,18 +707,11 @@ define void @atomic_min_umax_char(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_min_umax_char:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_min_umax_char:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzbl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_min_umax_char:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -796,18 +733,11 @@ define void @atomic_max_umin_char(ptr %addr) #0 {
 ;
 ; X86-SSE2-LABEL: atomic_max_umin_char:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    mfence
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl (%eax), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-SLM-LABEL: atomic_max_umin_char:
-; X86-SLM:       # %bb.0:
-; X86-SLM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLM-NEXT:    lock orl $0, (%esp)
-; X86-SLM-NEXT:    movzbl (%eax), %eax
-; X86-SLM-NEXT:    retl
-;
 ; X86-ATOM-LABEL: atomic_max_umin_char:
 ; X86-ATOM:       # %bb.0:
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
index 1b73a41439688..d6880ff9b0608 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store-wide.ll
@@ -8,8 +8,8 @@
 define void @test1(ptr %ptr, i64 %val1) {
 ; SSE42-LABEL: test1:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE42-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE42-NEXT:    movlps %xmm0, (%eax)
 ; SSE42-NEXT:    lock orl $0, (%esp)
 ; SSE42-NEXT:    retl
@@ -23,11 +23,11 @@ define void @test1(ptr %ptr, i64 %val1) {
 ; NOSSE-NEXT:    .cfi_def_cfa_register %ebp
 ; NOSSE-NEXT:    andl $-8, %esp
 ; NOSSE-NEXT:    subl $8, %esp
+; NOSSE-NEXT:    movl 12(%ebp), %eax
+; NOSSE-NEXT:    movl 16(%ebp), %ecx
+; NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    movl %eax, (%esp)
 ; NOSSE-NEXT:    movl 8(%ebp), %eax
-; NOSSE-NEXT:    movl 12(%ebp), %ecx
-; NOSSE-NEXT:    movl 16(%ebp), %edx
-; NOSSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl %ecx, (%esp)
 ; NOSSE-NEXT:    fildll (%esp)
 ; NOSSE-NEXT:    fistpll (%eax)
 ; NOSSE-NEXT:    lock orl $0, (%esp)
diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll
index 0d0108f55f2ab..aeb0edf5a028d 100644
--- a/llvm/test/CodeGen/X86/atomic-mi.ll
+++ b/llvm/test/CodeGen/X86/atomic-mi.ll
@@ -91,9 +91,9 @@ define void @store_atomic_imm_64(ptr %p) {
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    andl $-8, %esp
 ; X32-NEXT:    subl $8, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl $42, (%esp)
+; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
 ; X32-NEXT:    movl %ebp, %esp
@@ -124,9 +124,9 @@ define void @store_atomic_imm_64_big(ptr %p) {
 ; X32-NEXT:    .cfi_def_cfa_register %ebp
 ; X32-NEXT:    andl $-8, %esp
 ; X32-NEXT:    subl $8, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    movl $23, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl $1215752192, (%esp) # imm = 0x4876E800
+; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
 ; X32-NEXT:    movl %ebp, %esp
@@ -332,8 +332,8 @@ define void @add_64i(ptr %p) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    addl $2, %ecx
-; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -369,8 +369,8 @@ define void @add_64r(ptr %p, i64 %v) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    addl 12(%ebp), %ecx
-; X32-NEXT:    adcl 16(%ebp), %edx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    adcl 16(%ebp), %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -552,8 +552,8 @@ define void @sub_64r(ptr %p, i64 %v) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    subl 12(%ebp), %ecx
-; X32-NEXT:    sbbl 16(%ebp), %edx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    sbbl 16(%ebp), %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -751,9 +751,9 @@ define void @and_64r(ptr %p, i64 %v) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    andl 16(%ebp), %edx
 ; X32-NEXT:    andl 12(%ebp), %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    andl 16(%ebp), %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -937,8 +937,8 @@ define void @or_64i(ptr %p) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    orl $2, %ecx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    orl $2, %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -973,9 +973,9 @@ define void @or_64r(ptr %p, i64 %v) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    orl 16(%ebp), %edx
 ; X32-NEXT:    orl 12(%ebp), %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    orl 16(%ebp), %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1159,8 +1159,8 @@ define void @xor_64i(ptr %p) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    xorl $2, %ecx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    xorl $2, %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1195,9 +1195,9 @@ define void @xor_64r(ptr %p, i64 %v) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    xorl 16(%ebp), %edx
 ; X32-NEXT:    xorl 12(%ebp), %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    xorl 16(%ebp), %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1343,8 +1343,8 @@ define void @inc_64(ptr %p) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    addl $1, %ecx
-; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1481,8 +1481,8 @@ define void @dec_64(ptr %p) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    addl $-1, %ecx
-; X32-NEXT:    adcl $-1, %edx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    adcl $-1, %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1603,9 +1603,9 @@ define void @not_64(ptr %p) {
 ; X32-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    notl %edx
 ; X32-NEXT:    notl %ecx
 ; X32-NEXT:    movl %ecx, (%esp)
+; X32-NEXT:    notl %edx
 ; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
@@ -1719,8 +1719,8 @@ define void @neg_64(ptr %p) {
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll (%esp)
 ; X32-NEXT:    fistpll (%eax)
diff --git a/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll b/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
index 362135cb1808b..da63bd0fa9462 100644
--- a/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
+++ b/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -235,9 +235,9 @@ define void @tf_bug(ptr %ptr) nounwind {
 ; LINUX:       # %bb.0: # %entry
 ; LINUX-NEXT:    pushl %ebx
 ; LINUX-NEXT:    pushl %esi
-; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; LINUX-NEXT:    movl id+4, %edx
 ; LINUX-NEXT:    movl id, %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; LINUX-NEXT:    .p2align 4
 ; LINUX-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LINUX-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -249,8 +249,8 @@ define void @tf_bug(ptr %ptr) nounwind {
 ; LINUX-NEXT:    jne .LBB4_1
 ; LINUX-NEXT:  # %bb.2: # %atomicrmw.end
 ; LINUX-NEXT:    addl $1, %eax
-; LINUX-NEXT:    adcl $0, %edx
 ; LINUX-NEXT:    movl %eax, (%esi)
+; LINUX-NEXT:    adcl $0, %edx
 ; LINUX-NEXT:    movl %edx, 4(%esi)
 ; LINUX-NEXT:    popl %esi
 ; LINUX-NEXT:    popl %ebx
@@ -264,9 +264,9 @@ define void @tf_bug(ptr %ptr) nounwind {
 ; PIC-NEXT:    calll L4$pb
 ; PIC-NEXT:  L4$pb:
 ; PIC-NEXT:    popl %edi
-; PIC-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; PIC-NEXT:    movl _id-L4$pb+4(%edi), %edx
 ; PIC-NEXT:    movl _id-L4$pb(%edi), %eax
+; PIC-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; PIC-NEXT:    .p2align 4
 ; PIC-NEXT:  LBB4_1: ## %atomicrmw.start
 ; PIC-NEXT:    ## =>This Inner Loop Header: Depth=1
@@ -278,8 +278,8 @@ define void @tf_bug(ptr %ptr) nounwind {
 ; PIC-NEXT:    jne LBB4_1
 ; PIC-NEXT:  ## %bb.2: ## %atomicrmw.end
 ; PIC-NEXT:    addl $1, %eax
-; PIC-NEXT:    adcl $0, %edx
 ; PIC-NEXT:    movl %eax, (%esi)
+; PIC-NEXT:    adcl $0, %edx
 ; PIC-NEXT:    movl %edx, 4(%esi)
 ; PIC-NEXT:    popl %esi
 ; PIC-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll
index ed882cbbc5254..45b54381c4f60 100644
--- a/llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -16,33 +16,12 @@
 ;  and their calling convention which remain unresolved.)
 
 define void @store_half(ptr %fptr, half %v) {
-; X86-SSE1-LABEL: store_half:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movw %ax, (%ecx)
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: store_half:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movw %cx, (%eax)
-; X86-SSE2-NEXT:    retl
-;
-; X86-AVX-LABEL: store_half:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movw %cx, (%eax)
-; X86-AVX-NEXT:    retl
-;
-; X86-NOSSE-LABEL: store_half:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movw %ax, (%ecx)
-; X86-NOSSE-NEXT:    retl
+; X86-LABEL: store_half:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: store_half:
 ; X64-SSE:       # %bb.0:
@@ -64,7 +43,7 @@ define void @store_float(ptr %fptr, float %v) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: store_float:
@@ -83,23 +62,23 @@ define void @store_float(ptr %fptr, float %v) {
 define void @store_double(ptr %fptr, double %v) {
 ; X86-SSE1-LABEL: store_double:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE2-LABEL: store_double:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: store_double:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -109,9 +88,9 @@ define void @store_double(ptr %fptr, double %v) {
 ; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    movl %ecx, (%esp)
+; X86-NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    movl %eax, (%esp)
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    fildll (%esp)
 ; X86-NOSSE-NEXT:    fistpll (%eax)
 ; X86-NOSSE-NEXT:    addl $12, %esp
@@ -240,8 +219,8 @@ define double @load_double(ptr %fptr) {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    subl $12, %esp
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -436,10 +415,10 @@ define double @exchange_double(ptr %fptr, double %x) {
 ; X86-SSE1-NEXT:    .cfi_offset %esi, -12
 ; X86-SSE1-NEXT:    .cfi_offset %ebx, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    movl (%esi), %eax
 ; X86-SSE1-NEXT:    movl 4(%esi), %edx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    .p2align 4
 ; X86-SSE1-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-SSE1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -468,10 +447,10 @@ define double @exchange_double(ptr %fptr, double %x) {
 ; X86-SSE2-NEXT:    .cfi_offset %esi, -12
 ; X86-SSE2-NEXT:    .cfi_offset %ebx, -8
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl (%esi), %eax
 ; X86-SSE2-NEXT:    movl 4(%esi), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    .p2align 4
 ; X86-SSE2-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -502,10 +481,10 @@ define double @exchange_double(ptr %fptr, double %x) {
 ; X86-AVX-NEXT:    .cfi_offset %esi, -12
 ; X86-AVX-NEXT:    .cfi_offset %ebx, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl (%esi), %eax
 ; X86-AVX-NEXT:    movl 4(%esi), %edx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    .p2align 4
 ; X86-AVX-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-AVX-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -535,10 +514,10 @@ define double @exchange_double(ptr %fptr, double %x) {
 ; X86-NOSSE-NEXT:    .cfi_offset %esi, -12
 ; X86-NOSSE-NEXT:    .cfi_offset %ebx, -8
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    movl (%esi), %eax
 ; X86-NOSSE-NEXT:    movl 4(%esi), %edx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    .p2align 4
 ; X86-NOSSE-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-NOSSE-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -603,25 +582,25 @@ define void @store_float_seq_cst(ptr %fptr, float %v) {
 define void @store_double_seq_cst(ptr %fptr, double %v) {
 ; X86-SSE1-LABEL: store_double_seq_cst:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE1-NEXT:    lock orl $0, (%esp)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE2-LABEL: store_double_seq_cst:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    lock orl $0, (%esp)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: store_double_seq_cst:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-AVX-NEXT:    lock orl $0, (%esp)
 ; X86-AVX-NEXT:    retl
@@ -632,9 +611,9 @@ define void @store_double_seq_cst(ptr %fptr, double %v) {
 ; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    movl %ecx, (%esp)
+; X86-NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    movl %eax, (%esp)
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    fildll (%esp)
 ; X86-NOSSE-NEXT:    fistpll (%eax)
 ; X86-NOSSE-NEXT:    lock orl $0, (%esp)
@@ -724,8 +703,8 @@ define double @load_double_seq_cst(ptr %fptr) {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    subl $12, %esp
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss %xmm0, (%esp)
 ; X86-SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
@@ -809,16 +788,16 @@ define void @store_bfloat(ptr %fptr, bfloat %v) {
 ;
 ; X86-SSE2-LABEL: store_bfloat:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movw %cx, (%eax)
+; X86-SSE2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movw %ax, (%ecx)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: store_bfloat:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movw %cx, (%eax)
+; X86-AVX-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movw %ax, (%ecx)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-NOSSE-LABEL: store_bfloat:
diff --git a/llvm/test/CodeGen/X86/atomic-op.ll b/llvm/test/CodeGen/X86/atomic-op.ll
index 5bdc60b031179..b8822d6e5256f 100644
--- a/llvm/test/CodeGen/X86/atomic-op.ll
+++ b/llvm/test/CodeGen/X86/atomic-op.ll
@@ -1,8 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-- -mattr=+cmov,cx16 -verify-machineinstrs | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
 define void @func(i32 %argc, ptr %argv) nounwind {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $48, %esp
+; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $31, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $3855, {{[0-9]+}}(%esp) # imm = 0xF0F
+; CHECK-NEXT:    movl $3855, {{[0-9]+}}(%esp) # imm = 0xF0F
+; CHECK-NEXT:    movl $3855, {{[0-9]+}}(%esp) # imm = 0xF0F
+; CHECK-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    lock xaddl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $-30, %eax
+; CHECK-NEXT:    lock xaddl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    lock xaddl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    lock xaddl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start68
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andl $4080, %ecx # imm = 0xFF0
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end67
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_3: # %atomicrmw.start62
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    orl $4080, %ecx # imm = 0xFF0
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  # %bb.4: # %atomicrmw.end61
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start56
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    xorl $4080, %ecx # imm = 0xFF0
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.6: # %atomicrmw.end55
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_7: # %atomicrmw.start50
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl $17, %eax
+; CHECK-NEXT:    movl $16, %ecx
+; CHECK-NEXT:    cmovll %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_7
+; CHECK-NEXT:  # %bb.8: # %atomicrmw.end49
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_9: # %atomicrmw.start44
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl %ecx, %eax
+; CHECK-NEXT:    movl $-1, %edx
+; CHECK-NEXT:    cmovlel %eax, %edx
+; CHECK-NEXT:    lock cmpxchgl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_9
+; CHECK-NEXT:  # %bb.10: # %atomicrmw.end43
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_11: # %atomicrmw.start38
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    cmovgl %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_11
+; CHECK-NEXT:  # %bb.12: # %atomicrmw.end37
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_13: # %atomicrmw.start32
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    movl $0, %ecx
+; CHECK-NEXT:    cmovnsl %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_13
+; CHECK-NEXT:  # %bb.14: # %atomicrmw.end31
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_15: # %atomicrmw.start26
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl $65536, %eax # imm = 0x10000
+; CHECK-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; CHECK-NEXT:    cmovael %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_15
+; CHECK-NEXT:  # %bb.16: # %atomicrmw.end25
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_17: # %atomicrmw.start20
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl $11, %eax
+; CHECK-NEXT:    movl $10, %ecx
+; CHECK-NEXT:    cmovael %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_17
+; CHECK-NEXT:  # %bb.18: # %atomicrmw.end19
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_19: # %atomicrmw.start14
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl $2, %eax
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    cmovbl %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_19
+; CHECK-NEXT:  # %bb.20: # %atomicrmw.end13
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_21: # %atomicrmw.start8
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpl $11, %eax
+; CHECK-NEXT:    movl $10, %ecx
+; CHECK-NEXT:    cmovbl %eax, %ecx
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_21
+; CHECK-NEXT:  # %bb.22: # %atomicrmw.end7
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $1976, %eax # imm = 0x7B8
+; CHECK-NEXT:    xchgl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    movl $-10, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $1976, %eax # imm = 0x7B8
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_23: # %atomicrmw.start2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    orl $-1402, %ecx # imm = 0xFA86
+; CHECK-NEXT:    lock cmpxchgl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_23
+; CHECK-NEXT:  # %bb.24: # %atomicrmw.end1
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_25: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    notl %ebx
+; CHECK-NEXT:    orl $252645135, %ebx # imm = 0xF0F0F0F
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    orl $252645135, %ecx # imm = 0xF0F0F0F
+; CHECK-NEXT:    lock cmpxchg8b {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_25
+; CHECK-NEXT:  # %bb.26: # %atomicrmw.end
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal -4(%ebp), %esp
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
 	%argc.addr = alloca i32		; <ptr> [#uses=1]
 	%argv.addr = alloca ptr		; <ptr> [#uses=1]
@@ -23,117 +209,62 @@ entry:
 	store i32 3855, ptr %xort
 	store i32 4, ptr %temp
 	%tmp = load i32, ptr %temp
-        ; CHECK: lock
-        ; CHECK: xaddl
   %0 = atomicrmw add ptr %val1, i32 %tmp monotonic
 	store i32 %0, ptr %old
-        ; CHECK: lock
-        ; CHECK: xaddl
   %1 = atomicrmw sub ptr %val2, i32 30 monotonic
 	store i32 %1, ptr %old
-        ; CHECK: lock
-        ; CHECK: xaddl
   %2 = atomicrmw add ptr %val2, i32 1 monotonic
 	store i32 %2, ptr %old
-        ; CHECK: lock
-        ; CHECK: xaddl
   %3 = atomicrmw sub ptr %val2, i32 1 monotonic
 	store i32 %3, ptr %old
-        ; CHECK: andl
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %4 = atomicrmw and ptr %andt, i32 4080 monotonic
 	store i32 %4, ptr %old
-        ; CHECK: orl
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %5 = atomicrmw or ptr %ort, i32 4080 monotonic
 	store i32 %5, ptr %old
-        ; CHECK: xorl
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %6 = atomicrmw xor ptr %xort, i32 4080 monotonic
 	store i32 %6, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %7 = atomicrmw min ptr %val2, i32 16 monotonic
 	store i32 %7, ptr %old
 	%neg = sub i32 0, 1		; <i32> [#uses=1]
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %8 = atomicrmw min ptr %val2, i32 %neg monotonic
 	store i32 %8, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %9 = atomicrmw max ptr %val2, i32 1 monotonic
 	store i32 %9, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %10 = atomicrmw max ptr %val2, i32 0 monotonic
 	store i32 %10, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %11 = atomicrmw umax ptr %val2, i32 65535 monotonic
 	store i32 %11, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %12 = atomicrmw umax ptr %val2, i32 10 monotonic
 	store i32 %12, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %13 = atomicrmw umin ptr %val2, i32 1 monotonic
 	store i32 %13, ptr %old
-        ; CHECK: cmov
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %14 = atomicrmw umin ptr %val2, i32 10 monotonic
 	store i32 %14, ptr %old
-        ; CHECK: xchgl   %{{.*}}, {{.*}}(%esp)
   %15 = atomicrmw xchg ptr %val2, i32 1976 monotonic
 	store i32 %15, ptr %old
 	%neg1 = sub i32 0, 10		; <i32> [#uses=1]
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %pair16 = cmpxchg ptr %val2, i32 %neg1, i32 1 monotonic monotonic
   %16 = extractvalue { i32, i1 } %pair16, 0
 	store i32 %16, ptr %old
-        ; CHECK: lock
-        ; CHECK: cmpxchgl
   %pair17 = cmpxchg ptr %val2, i32 1976, i32 1 monotonic monotonic
   %17 = extractvalue { i32, i1 } %pair17, 0
 	store i32 %17, ptr %old
-        ; CHECK: movl  [[R17atomic:.*]], %eax
-        ; CHECK: movl %eax, %[[R17mask:[a-z]*]]
-        ; CHECK: notl %[[R17mask]]
-        ; CHECK: orl $-1402, %[[R17mask]]
-        ; CHECK: lock
-        ; CHECK: cmpxchgl	%[[R17mask]], [[R17atomic]]
-        ; CHECK: jne
-        ; CHECK: movl	%eax,
   %18 = atomicrmw nand ptr %val2, i32 1401 monotonic
   store i32 %18, ptr %old
-        ; CHECK: notl
-        ; CHECK: notl
-        ; CHECK: orl $252645135
-        ; CHECK: orl $252645135
-        ; CHECK: lock
-        ; CHECK: cmpxchg8b
   %19 = atomicrmw nand ptr %temp64, i64 17361641481138401520 monotonic
   store i64 %19, ptr %temp64
 	ret void
 }
 
 define void @test2(ptr addrspace(256) nocapture %P) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, %gs:(%edx)
+; CHECK-NEXT:    retl
 entry:
-; CHECK: lock
-; CHECK:	cmpxchgl	%{{.*}}, %gs:(%{{.*}})
 
   %pair0 = cmpxchg ptr addrspace(256) %P, i32 0, i32 1 monotonic monotonic
   %0 = extractvalue { i32, i1 } %pair0, 0
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index 71887e369bd18..a49c9943f2e73 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -5,23 +5,21 @@
 define zeroext i8 @atomic_shl1_or_8_gpr_val(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_8_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzbl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%esi)
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    orb %dl, %ah
+; X86-NEXT:    lock cmpxchgb %ah, (%ecx)
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    andb %al, %dl
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_8_gpr_val:
@@ -55,27 +53,25 @@ define zeroext i8 @atomic_shl1_mask0_or_8_gpr_val(ptr %v, i8 zeroext %c) nounwin
 ; X86-LABEL: atomic_shl1_mask0_or_8_gpr_val:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    movb $1, %ah
 ; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movb (%esi), %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb (%ecx), %al
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB1_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orb %ah, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%esi)
+; X86-NEXT:    movb %al, %dh
+; X86-NEXT:    orb %ah, %dh
+; X86-NEXT:    lock cmpxchgb %dh, (%ecx)
 ; X86-NEXT:    jne .LBB1_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movl $1, %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    andb %bl, %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
@@ -113,24 +109,22 @@ entry:
 define zeroext i8 @atomic_shl1_mask01_or_8_gpr_val(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_8_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    movb $1, %dl
 ; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%esi)
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    orb %dl, %ah
+; X86-NEXT:    lock cmpxchgb %ah, (%ecx)
 ; X86-NEXT:    jne .LBB2_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    andb %al, %dl
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_8_gpr_val:
@@ -163,24 +157,22 @@ entry:
 define zeroext i8 @atomic_shl1_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_xor_8_gpr_valz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzbl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb %dl, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%esi)
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    xorb %dl, %ah
+; X86-NEXT:    lock cmpxchgb %ah, (%ecx)
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_xor_8_gpr_valz:
@@ -217,27 +209,25 @@ entry:
 define zeroext i8 @atomic_shl1_mask0_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_xor_8_gpr_valz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    movb $1, %ah
 ; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movb (%esi), %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb (%ecx), %al
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb %ah, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%esi)
+; X86-NEXT:    movb %al, %dh
+; X86-NEXT:    xorb %ah, %dh
+; X86-NEXT:    lock cmpxchgb %dh, (%ecx)
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movzbl %dl, %ecx
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    btl %ecx, %eax
 ; X86-NEXT:    setae %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_xor_8_gpr_valz:
@@ -276,25 +266,23 @@ entry:
 define zeroext i8 @atomic_shl1_mask01_xor_8_gpr_valz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_xor_8_gpr_valz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movzbl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb %bl, %cl
-; X86-NEXT:    lock cmpxchgb %cl, (%edx)
+; X86-NEXT:    movb %al, %ah
+; X86-NEXT:    xorb %dl, %ah
+; X86-NEXT:    lock cmpxchgb %ah, (%ecx)
 ; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    testl %eax, %ebx
+; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_xor_8_gpr_valz:
@@ -333,33 +321,33 @@ entry:
 define zeroext i8 @atomic_shl1_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_and_8_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %ebx
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movb %bl, %ah
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movb %dl, %ah
 ; X86-NEXT:    notb %ah
-; X86-NEXT:    movb (%edx), %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movb (%esi), %al
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movb %al, %ch
 ; X86-NEXT:    andb %ah, %ch
-; X86-NEXT:    lock cmpxchgb %ch, (%edx)
+; X86-NEXT:    lock cmpxchgb %ch, (%esi)
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    testl %eax, %ebx
+; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    je .LBB6_3
 ; X86-NEXT:  # %bb.4: # %if.then
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    movzbl (%edx,%eax), %eax
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movzbl (%esi,%eax), %eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB6_3:
 ; X86-NEXT:    movb $123, %al
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_and_8_gpr_brnz:
@@ -413,11 +401,10 @@ return:                                           ; preds = %entry, %if.then
 define zeroext i8 @atomic_shl1_mask0_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_and_8_gpr_brnz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb $-2, %ah
 ; X86-NEXT:    rolb %cl, %ah
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb (%edx), %al
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB7_1: # %atomicrmw.start
@@ -427,17 +414,15 @@ define zeroext i8 @atomic_shl1_mask0_and_8_gpr_brnz(ptr %v, i8 zeroext %c) nounw
 ; X86-NEXT:    lock cmpxchgb %ch, (%edx)
 ; X86-NEXT:    jne .LBB7_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    btl %eax, %esi
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    btl %ecx, %eax
 ; X86-NEXT:    jae .LBB7_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movzbl (%edx,%eax), %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movzbl (%edx,%ecx), %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB7_3:
 ; X86-NEXT:    movb $123, %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_and_8_gpr_brnz:
@@ -491,29 +476,29 @@ define zeroext i8 @atomic_shl1_mask01_and_8_gpr_brnz(ptr %v, i8 zeroext %c) noun
 ; X86-LABEL: atomic_shl1_mask01_and_8_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb %ah, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    movl $1, %ebx
 ; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    notb %cl
-; X86-NEXT:    movb (%edx), %al
+; X86-NEXT:    movb %bl, %ah
+; X86-NEXT:    notb %ah
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb (%ecx), %al
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movb %al, %ch
-; X86-NEXT:    andb %cl, %ch
-; X86-NEXT:    lock cmpxchgb %ch, (%edx)
+; X86-NEXT:    movb %al, %dh
+; X86-NEXT:    andb %ah, %dh
+; X86-NEXT:    lock cmpxchgb %dh, (%ecx)
 ; X86-NEXT:    jne .LBB8_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    testl %ecx, %ebx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    testl %eax, %ebx
 ; X86-NEXT:    je .LBB8_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movzbl %ah, %eax
-; X86-NEXT:    movzbl (%edx,%eax), %eax
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    movzbl (%ecx,%eax), %eax
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB8_3:
@@ -705,18 +690,18 @@ define zeroext i16 @atomic_shl1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind
 ; X86-LABEL: atomic_shl1_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB12_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %si, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB12_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -757,9 +742,9 @@ entry:
 define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btcw %cx, (%edx)
 ; X86-NEXT:    setb %al
@@ -793,27 +778,27 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB14_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB14_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -858,17 +843,17 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB15_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB15_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -876,8 +861,8 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -919,9 +904,9 @@ entry:
 define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btcw %cx, (%edx)
 ; X86-NEXT:    setb %al
@@ -953,11 +938,11 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_val(ptr %v, i16 zeroext %c) nounwind
 ; X86-LABEL: atomic_blsi_xor_16_gpr_val:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB17_1: # %atomicrmw.start
@@ -1005,18 +990,18 @@ define zeroext i16 @atomic_shl1_xor_16_gpr_valz(ptr %v, i16 zeroext %c) nounwind
 ; X86-LABEL: atomic_shl1_xor_16_gpr_valz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB18_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %si, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB18_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -1068,11 +1053,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_valz(ptr %v, i16 zeroext %
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    movzwl %si, %esi
 ; X86-NEXT:    .p2align 4
@@ -1137,19 +1122,19 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valz(ptr %v, i16 zeroext %c) no
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB20_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB20_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -1204,22 +1189,22 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_valz(ptr %v, i16 zeroext %c) no
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB21_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB21_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setae %al
@@ -1270,25 +1255,25 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_valz(ptr %v, i16 zeroext %c) n
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_valz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB22_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %si, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB22_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzwl %ax, %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testl %ecx, %esi
+; X86-NEXT:    testl %ecx, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -1335,11 +1320,11 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_valz(ptr %v, i16 zeroext %c) nounwind
 ; X86-LABEL: atomic_blsi_xor_16_gpr_valz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB23_1: # %atomicrmw.start
@@ -1398,18 +1383,18 @@ define zeroext i16 @atomic_shl1_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) nounwin
 ; X86-LABEL: atomic_shl1_xor_16_gpr_valnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB24_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %si, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB24_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -1461,11 +1446,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_valnz(ptr %v, i16 zeroext
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    movzwl %si, %esi
 ; X86-NEXT:    .p2align 4
@@ -1530,19 +1515,19 @@ define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB26_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB26_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -1597,22 +1582,22 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB27_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB27_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setb %al
@@ -1663,25 +1648,25 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_valnz(ptr %v, i16 zeroext %c)
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_valnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB28_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %si, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB28_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzwl %ax, %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testl %ecx, %esi
+; X86-NEXT:    testl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -1728,11 +1713,11 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_valnz(ptr %v, i16 zeroext %c) nounwin
 ; X86-LABEL: atomic_blsi_xor_16_gpr_valnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB29_1: # %atomicrmw.start
@@ -1793,9 +1778,9 @@ define zeroext i16 @atomic_shl1_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB30_1: # %atomicrmw.start
@@ -1871,11 +1856,11 @@ define zeroext i16 @atomic_shl1_small_mask_xor_16_gpr_brz(ptr %v, i16 zeroext %c
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    movzwl %si, %esi
 ; X86-NEXT:    .p2align 4
@@ -1953,35 +1938,35 @@ return:                                           ; preds = %entry, %if.then
 define zeroext i16 @atomic_shl1_mask0_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_xor_16_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB32_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB32_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movzwl %ax, %esi
 ; X86-NEXT:    movw $123, %ax
-; X86-NEXT:    btl %ebx, %ecx
+; X86-NEXT:    btl %edx, %esi
 ; X86-NEXT:    jb .LBB32_4
 ; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movzwl %bx, %eax
-; X86-NEXT:    movzwl (%edx,%eax,2), %eax
+; X86-NEXT:    movzwl %dx, %eax
+; X86-NEXT:    movzwl (%ecx,%eax,2), %eax
 ; X86-NEXT:  .LBB32_4: # %return
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_xor_16_gpr_brz:
@@ -2039,9 +2024,9 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB33_1: # %atomicrmw.start
@@ -2053,11 +2038,11 @@ define zeroext i16 @atomic_shl1_mask1_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB33_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $15, %edi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $15, %esi
+; X86-NEXT:    movzwl %ax, %edi
 ; X86-NEXT:    movw $123, %ax
-; X86-NEXT:    btl %edi, %esi
+; X86-NEXT:    btl %esi, %edi
 ; X86-NEXT:    jb .LBB33_4
 ; X86-NEXT:  # %bb.3: # %if.then
 ; X86-NEXT:    movzwl %cx, %eax
@@ -2122,35 +2107,35 @@ return:                                           ; preds = %entry, %if.then
 define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB34_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB34_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movzwl %ax, %edi
 ; X86-NEXT:    movw $123, %ax
-; X86-NEXT:    testl %ecx, %esi
+; X86-NEXT:    testl %edi, %esi
 ; X86-NEXT:    jne .LBB34_4
 ; X86-NEXT:  # %bb.3: # %if.then
-; X86-NEXT:    movzwl %bx, %eax
-; X86-NEXT:    movzwl (%edx,%eax,2), %eax
+; X86-NEXT:    movzwl %dx, %eax
+; X86-NEXT:    movzwl (%ecx,%eax,2), %eax
 ; X86-NEXT:  .LBB34_4: # %return
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
@@ -2210,10 +2195,10 @@ define zeroext i16 @atomic_blsi_xor_16_gpr_brz(ptr %v, i16 zeroext %c) nounwind
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
 ; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB35_1: # %atomicrmw.start
@@ -2290,20 +2275,20 @@ define zeroext i16 @atomic_shl1_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $-2, %esi
+; X86-NEXT:    roll %cl, %esi
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl $-2, %edi
-; X86-NEXT:    roll %cl, %edi
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB36_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    andl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB36_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2348,9 +2333,9 @@ entry:
 define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btrw %cx, (%edx)
 ; X86-NEXT:    setb %al
@@ -2385,25 +2370,25 @@ define zeroext i16 @atomic_shl1_mask0_and_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movw $-2, %si
-; X86-NEXT:    rolw %cl, %si
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movw $-2, %dx
+; X86-NEXT:    rolw %cl, %dx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB38_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB38_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -2448,17 +2433,17 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $-2, %esi
-; X86-NEXT:    roll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $-2, %edx
+; X86-NEXT:    roll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB39_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB39_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2466,8 +2451,8 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_val(ptr %v, i16 zeroext %c) nou
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $cx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -2510,9 +2495,9 @@ entry:
 define zeroext i16 @atomic_shl1_mask01_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_and_16_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btrw %cx, (%edx)
 ; X86-NEXT:    setb %al
@@ -2546,21 +2531,21 @@ define zeroext i16 @atomic_blsi_and_16_gpr_val(ptr %v, i16 zeroext %c) nounwind
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB41_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB41_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2605,20 +2590,20 @@ define zeroext i16 @atomic_shl1_and_16_gpr_valnz(ptr %v, i16 zeroext %c) nounwin
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl $-2, %edi
-; X86-NEXT:    roll %cl, %edi
-; X86-NEXT:    movzwl (%esi), %eax
+; X86-NEXT:    movl $-2, %esi
+; X86-NEXT:    roll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB42_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    andl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%esi)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB42_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2674,19 +2659,19 @@ define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_valnz(ptr %v, i16 zeroext
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
-; X86-NEXT:    movw $-2, %si
-; X86-NEXT:    rolw %cl, %si
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movw $-2, %dx
+; X86-NEXT:    rolw %cl, %dx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB43_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB43_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2742,17 +2727,17 @@ define zeroext i16 @atomic_shl1_mask0_and_16_gpr_valnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movw $-2, %si
-; X86-NEXT:    rolw %cl, %si
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movw $-2, %dx
+; X86-NEXT:    rolw %cl, %dx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB44_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB44_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -2807,22 +2792,22 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_valnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $-2, %esi
-; X86-NEXT:    roll %cl, %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $-2, %edx
+; X86-NEXT:    roll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB45_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB45_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movzwl %ax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setb %al
@@ -2875,27 +2860,27 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_valnz(ptr %v, i16 zeroext %c)
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl $-2, %edi
-; X86-NEXT:    roll %cl, %edi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $-2, %esi
+; X86-NEXT:    roll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB46_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    andl %esi, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB46_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzwl %ax, %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testl %ecx, %esi
+; X86-NEXT:    testl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -2947,21 +2932,21 @@ define zeroext i16 @atomic_blsi_and_16_gpr_valnz(ptr %v, i16 zeroext %c) nounwin
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB47_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    andl %edx, %edi
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %di, (%edx)
+; X86-NEXT:    lock cmpxchgw %di, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB47_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -3019,28 +3004,28 @@ define zeroext i16 @atomic_shl1_and_16_gpr_brnz(ptr %v, i16 zeroext %c) nounwind
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl $-2, %edi
 ; X86-NEXT:    roll %cl, %edi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB48_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    andl %edi, %ebx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %bx, (%edx)
+; X86-NEXT:    lock cmpxchgw %bx, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB48_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    testl %eax, %esi
+; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    je .LBB48_3
 ; X86-NEXT:  # %bb.4: # %if.then
 ; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl (%edx,%eax,2), %eax
+; X86-NEXT:    movzwl (%esi,%eax,2), %eax
 ; X86-NEXT:    jmp .LBB48_5
 ; X86-NEXT:  .LBB48_3:
 ; X86-NEXT:    movw $123, %ax
@@ -3105,11 +3090,11 @@ define zeroext i16 @atomic_shl1_small_mask_and_16_gpr_brnz(ptr %v, i16 zeroext %
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $7, %ecx
 ; X86-NEXT:    movw $-2, %si
 ; X86-NEXT:    rolw %cl, %si
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB49_1: # %atomicrmw.start
@@ -3191,9 +3176,9 @@ define zeroext i16 @atomic_shl1_mask0_and_16_gpr_brnz(ptr %v, i16 zeroext %c) no
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movw $-2, %si
 ; X86-NEXT:    rolw %cl, %si
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB50_1: # %atomicrmw.start
@@ -3274,9 +3259,9 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_brnz(ptr %v, i16 zeroext %c) no
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $-2, %esi
 ; X86-NEXT:    roll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB51_1: # %atomicrmw.start
@@ -3288,9 +3273,9 @@ define zeroext i16 @atomic_shl1_mask1_and_16_gpr_brnz(ptr %v, i16 zeroext %c) no
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB51_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    andl $15, %esi
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    btl %esi, %eax
 ; X86-NEXT:    jae .LBB51_3
 ; X86-NEXT:  # %bb.4: # %if.then
@@ -3363,22 +3348,22 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $-2, %edi
 ; X86-NEXT:    roll %cl, %edi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB52_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    andl %edi, %ebx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %cx, (%edx)
+; X86-NEXT:    lock cmpxchgw %bx, (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB52_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
@@ -3386,8 +3371,8 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X86-NEXT:    testl %eax, %esi
 ; X86-NEXT:    je .LBB52_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movzwl %bx, %eax
-; X86-NEXT:    movzwl (%edx,%eax,2), %eax
+; X86-NEXT:    movzwl %dx, %eax
+; X86-NEXT:    movzwl (%ecx,%eax,2), %eax
 ; X86-NEXT:    jmp .LBB52_5
 ; X86-NEXT:  .LBB52_3:
 ; X86-NEXT:    movw $123, %ax
@@ -3458,29 +3443,29 @@ define zeroext i16 @atomic_blsi_and_16_gpr_brnz(ptr %v, i16 zeroext %c) nounwind
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    notl %edi
-; X86-NEXT:    movzwl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzwl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB53_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    andl %edi, %ebx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    lock cmpxchgw %bx, (%edx)
+; X86-NEXT:    lock cmpxchgw %bx, (%esi)
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    jne .LBB53_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    testl %eax, %esi
+; X86-NEXT:    testl %eax, %edx
 ; X86-NEXT:    je .LBB53_3
 ; X86-NEXT:  # %bb.4: # %if.then
 ; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl (%edx,%eax,2), %eax
+; X86-NEXT:    movzwl (%esi,%eax,2), %eax
 ; X86-NEXT:    jmp .LBB53_5
 ; X86-NEXT:  .LBB53_3:
 ; X86-NEXT:    movw $123, %ax
@@ -3724,9 +3709,9 @@ entry:
 define zeroext i16 @atomic_shl1_and_16_const_brz(ptr %v) nounwind {
 ; X86-LABEL: atomic_shl1_and_16_const_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movw $123, %ax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btrw $4, (%ecx)
-; X86-NEXT:    movw $123, %ax
 ; X86-NEXT:    jae .LBB59_1
 ; X86-NEXT:  # %bb.2: # %return
 ; X86-NEXT:    retl
@@ -3763,9 +3748,9 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    setb %al
@@ -3793,9 +3778,9 @@ entry:
 define i32 @atomic_shl1_small_mask_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    setb %al
@@ -3824,9 +3809,9 @@ entry:
 define i32 @atomic_shl1_mask0_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    setb %al
@@ -3856,9 +3841,9 @@ entry:
 define i32 @atomic_shl1_mask1_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    setb %al
@@ -3888,9 +3873,9 @@ entry:
 define i32 @atomic_shl1_mask01_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    setb %al
@@ -3920,11 +3905,11 @@ define i32 @atomic_blsi_or_32_gpr_val(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_or_32_gpr_val:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB65_1: # %atomicrmw.start
@@ -3968,16 +3953,16 @@ define i32 @atomic_shl1_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB66_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB66_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4021,18 +4006,18 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB67_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB67_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4079,16 +4064,16 @@ define i32 @atomic_shl1_mask0_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB68_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB68_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4134,16 +4119,16 @@ define i32 @atomic_shl1_mask1_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB69_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB69_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4188,18 +4173,18 @@ define i32 @atomic_shl1_mask01_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB70_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB70_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4244,11 +4229,11 @@ define i32 @atomic_blsi_or_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_or_32_gpr_valz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    negl %edx
 ; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB71_1: # %atomicrmw.start
@@ -4300,16 +4285,16 @@ define i32 @atomic_shl1_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB72_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB72_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4352,18 +4337,18 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB73_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB73_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4409,16 +4394,16 @@ define i32 @atomic_shl1_mask0_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB74_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB74_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4462,17 +4447,17 @@ define i32 @atomic_shl1_mask1_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB75_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB75_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4516,18 +4501,18 @@ define i32 @atomic_shl1_mask01_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB76_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB76_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -4571,11 +4556,11 @@ define i32 @atomic_blsi_or_32_gpr_valnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_or_32_gpr_valnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    negl %edx
 ; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB77_1: # %atomicrmw.start
@@ -4625,9 +4610,9 @@ define i32 @atomic_shl1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB78_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -4672,12 +4657,12 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    lock btsl %eax, (%ecx)
 ; X86-NEXT:    jae .LBB79_1
 ; X86-NEXT:  # %bb.2: # %if.then
-; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB79_1:
 ; X86-NEXT:    movl $123, %eax
@@ -4718,9 +4703,9 @@ define i32 @atomic_shl1_mask0_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB80_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -4767,9 +4752,9 @@ define i32 @atomic_shl1_mask1_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB81_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -4816,9 +4801,9 @@ define i32 @atomic_shl1_mask01_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_br:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB82_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -4866,23 +4851,23 @@ define i32 @atomic_blsi_or_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB83_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB83_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
+; X86-NEXT:    testl %edx, %eax
 ; X86-NEXT:    je .LBB83_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    movl (%esi,%ecx,4), %eax
 ; X86-NEXT:    jmp .LBB83_5
 ; X86-NEXT:  .LBB83_3:
 ; X86-NEXT:    movl $123, %eax
@@ -4936,17 +4921,20 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $31, %eax
-; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $31, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock btsl %esi, (%edx)
 ; X86-NEXT:    jae .LBB84_1
 ; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB84_1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_or_32_gpr_brz:
@@ -4984,15 +4972,15 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    lock btsl %edx, (%ecx)
+; X86-NEXT:    andl $15, %ecx
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock btsl %ecx, (%edx)
 ; X86-NEXT:    jae .LBB85_1
 ; X86-NEXT:  # %bb.2: # %return
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB85_1: # %if.then
-; X86-NEXT:    movl (%ecx,%edx,4), %eax
+; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_or_32_gpr_brz:
@@ -5029,17 +5017,20 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask0_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $31, %eax
-; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $31, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock btsl %esi, (%edx)
 ; X86-NEXT:    jae .LBB86_1
 ; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB86_1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask0_or_32_gpr_brz:
@@ -5078,17 +5069,20 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask1_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $31, %eax
-; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $31, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock btsl %esi, (%edx)
 ; X86-NEXT:    jae .LBB87_1
 ; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB87_1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask1_or_32_gpr_brz:
@@ -5127,17 +5121,20 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $31, %eax
-; X86-NEXT:    lock btsl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $31, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock btsl %esi, (%edx)
 ; X86-NEXT:    jae .LBB88_1
 ; X86-NEXT:  # %bb.2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB88_1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_or_32_gpr_brz:
@@ -5178,10 +5175,10 @@ define i32 @atomic_blsi_or_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    negl %edi
 ; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB89_1: # %atomicrmw.start
@@ -5250,9 +5247,9 @@ define i32 @atomic_shl1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB90_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -5297,12 +5294,12 @@ define i32 @atomic_shl1_small_mask_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    lock btsl %ecx, (%eax)
+; X86-NEXT:    lock btsl %eax, (%ecx)
 ; X86-NEXT:    jae .LBB91_1
 ; X86-NEXT:  # %bb.2: # %if.then
-; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB91_1:
 ; X86-NEXT:    movl $123, %eax
@@ -5343,9 +5340,9 @@ define i32 @atomic_shl1_mask0_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask0_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB92_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -5392,9 +5389,9 @@ define i32 @atomic_shl1_mask1_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask1_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB93_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -5441,9 +5438,9 @@ define i32 @atomic_shl1_mask01_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_or_32_gpr_brnz:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $31, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btsl %edx, (%ecx)
 ; X86-NEXT:    jae .LBB94_1
 ; X86-NEXT:  # %bb.2: # %if.then
@@ -5491,23 +5488,23 @@ define i32 @atomic_blsi_or_32_gpr_brnz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB95_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB95_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    testl %esi, %eax
+; X86-NEXT:    testl %edx, %eax
 ; X86-NEXT:    je .LBB95_3
 ; X86-NEXT:  # %bb.4: # %if.then
-; X86-NEXT:    movl (%edx,%ecx,4), %eax
+; X86-NEXT:    movl (%esi,%ecx,4), %eax
 ; X86-NEXT:    jmp .LBB95_5
 ; X86-NEXT:  .LBB95_3:
 ; X86-NEXT:    movl $123, %eax
@@ -5564,16 +5561,16 @@ define i32 @atomic_shl1_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB96_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB96_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -5617,18 +5614,18 @@ define i32 @atomic_shl1_small_mask_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB97_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB97_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -5675,16 +5672,16 @@ define i32 @atomic_shl1_mask0_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB98_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB98_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -5730,16 +5727,16 @@ define i32 @atomic_shl1_mask1_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB99_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB99_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -5784,18 +5781,18 @@ define i32 @atomic_shl1_mask01_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB100_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    lock cmpxchgl %edi, (%edx)
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    lock cmpxchgl %edi, (%esi)
 ; X86-NEXT:    jne .LBB100_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
 ; X86-NEXT:    xorl %edx, %edx
@@ -5839,13 +5836,13 @@ entry:
 define i32 @atomic_blsi_and_32_gpr_valz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_and_32_gpr_valz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    lock andl %edx, (%ecx)
+; X86-NEXT:    lock andl %ecx, (%edx)
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
@@ -5871,16 +5868,18 @@ entry:
 define i32 @atomic_shl1_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_and_32_gpr_br:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    je .LBB102_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB102_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_and_32_gpr_br:
@@ -5917,17 +5916,19 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_small_mask_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_and_32_gpr_br:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    je .LBB103_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB103_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_and_32_gpr_br:
@@ -5969,9 +5970,9 @@ define i32 @atomic_shl1_mask0_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB104_1: # %atomicrmw.start
@@ -6042,9 +6043,9 @@ define i32 @atomic_shl1_mask1_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB105_1: # %atomicrmw.start
@@ -6112,16 +6113,18 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_and_32_gpr_br:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    je .LBB106_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB106_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_and_32_gpr_br:
@@ -6159,17 +6162,19 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_blsi_and_32_gpr_br(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_and_32_gpr_br:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    andl %ecx, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    je .LBB107_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB107_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_blsi_and_32_gpr_br:
@@ -6207,16 +6212,18 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_and_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    jne .LBB108_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB108_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_and_32_gpr_brz:
@@ -6253,17 +6260,19 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_small_mask_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_small_mask_and_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    jne .LBB109_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB109_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_small_mask_and_32_gpr_brz:
@@ -6305,9 +6314,9 @@ define i32 @atomic_shl1_mask0_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB110_1: # %atomicrmw.start
@@ -6379,9 +6388,9 @@ define i32 @atomic_shl1_mask1_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl $1, %edx
 ; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB111_1: # %atomicrmw.start
@@ -6450,16 +6459,18 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_mask01_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_shl1_mask01_and_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    jne .LBB112_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB112_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_shl1_mask01_and_32_gpr_brz:
@@ -6497,17 +6508,19 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_blsi_and_32_gpr_brz(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_blsi_and_32_gpr_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    lock andl %eax, (%edx)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    andl %ecx, %esi
 ; X86-NEXT:    movl $123, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    lock andl %esi, (%edx)
 ; X86-NEXT:    jne .LBB113_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl (%edx,%ecx,4), %eax
 ; X86-NEXT:  .LBB113_2: # %return
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: atomic_blsi_and_32_gpr_brz:
@@ -6792,9 +6805,9 @@ return:                                           ; preds = %entry, %if.then
 define i32 @atomic_shl1_and_32_const_brz(ptr %v) nounwind {
 ; X86-LABEL: atomic_shl1_and_32_const_brz:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $123, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    lock btrl $4, (%ecx)
-; X86-NEXT:    movl $123, %eax
 ; X86-NEXT:    jae .LBB121_1
 ; X86-NEXT:  # %bb.2: # %return
 ; X86-NEXT:    retl
@@ -6833,21 +6846,21 @@ define i32 @atomic_xor_dead_and(ptr %v, i32 %c) nounwind {
 ; X86-LABEL: atomic_xor_dead_and:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movl $1, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB122_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    lock cmpxchgl %ecx, (%edx)
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    lock cmpxchgl %esi, (%ecx)
 ; X86-NEXT:    jne .LBB122_1
 ; X86-NEXT:  # %bb.2: # %atomicrmw.end
-; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    andl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -6883,9 +6896,9 @@ define i32 @atomic_xor_with_not_arg(ptr %v, i32 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB123_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -6921,9 +6934,9 @@ define i16 @atomic_or_with_not_arg(ptr %v, i16 %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    movzwl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB124_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -6964,9 +6977,9 @@ define i8 @atomic_and_with_not_arg(ptr %v, i8 %c) nounwind {
 ; X86-LABEL: atomic_and_with_not_arg:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    notb %dl
-; X86-NEXT:    movzbl (%ecx), %eax
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB125_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/X86/atomicrmw-cond-sub-clamp.ll
index 3e1a631e39b06..46ec1d94d353c 100644
--- a/llvm/test/CodeGen/X86/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/X86/atomicrmw-cond-sub-clamp.ll
@@ -5,19 +5,19 @@
 define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; CHECK-32-LABEL: atomicrmw_usub_cond_i8:
 ; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movzbl (%edx), %eax
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movzbl (%ecx), %eax
+; CHECK-32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; CHECK-32-NEXT:    jmp .LBB0_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB0_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchgb %ah, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgb %ah, (%ecx)
 ; CHECK-32-NEXT:    je .LBB0_4
 ; CHECK-32-NEXT:  .LBB0_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    movb %al, %ah
-; CHECK-32-NEXT:    subb %cl, %ah
+; CHECK-32-NEXT:    subb %dl, %ah
 ; CHECK-32-NEXT:    jae .LBB0_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -51,21 +51,21 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; CHECK-32-NEXT:    pushl %esi
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-32-NEXT:    .cfi_offset %esi, -8
-; CHECK-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movzwl (%edx), %eax
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movzwl (%ecx), %eax
+; CHECK-32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; CHECK-32-NEXT:    jmp .LBB1_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB1_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; CHECK-32-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-32-NEXT:    lock cmpxchgw %si, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgw %si, (%ecx)
 ; CHECK-32-NEXT:    # kill: def $ax killed $ax def $eax
 ; CHECK-32-NEXT:    je .LBB1_4
 ; CHECK-32-NEXT:  .LBB1_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    movl %eax, %esi
-; CHECK-32-NEXT:    subw %cx, %si
+; CHECK-32-NEXT:    subw %dx, %si
 ; CHECK-32-NEXT:    jae .LBB1_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB1_1 Depth=1
@@ -104,18 +104,18 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) {
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-32-NEXT:    .cfi_offset %esi, -8
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movl (%ecx), %eax
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movl (%edx), %eax
 ; CHECK-32-NEXT:    jmp .LBB2_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB2_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchgl %esi, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgl %esi, (%ecx)
 ; CHECK-32-NEXT:    je .LBB2_4
 ; CHECK-32-NEXT:  .LBB2_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    movl %eax, %esi
-; CHECK-32-NEXT:    subl %ecx, %esi
+; CHECK-32-NEXT:    subl %edx, %esi
 ; CHECK-32-NEXT:    jae .LBB2_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB2_1 Depth=1
@@ -159,22 +159,22 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; CHECK-32-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-32-NEXT:    .cfi_offset %ebp, -8
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT:    movl (%esi), %eax
+; CHECK-32-NEXT:    movl 4(%esi), %edx
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-32-NEXT:    movl (%ebp), %eax
-; CHECK-32-NEXT:    movl 4(%ebp), %edx
 ; CHECK-32-NEXT:    jmp .LBB3_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB3_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchg8b (%ebp)
+; CHECK-32-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK-32-NEXT:    je .LBB3_4
 ; CHECK-32-NEXT:  .LBB3_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    movl %eax, %ebx
-; CHECK-32-NEXT:    subl %edi, %ebx
+; CHECK-32-NEXT:    subl %ebp, %ebx
 ; CHECK-32-NEXT:    movl %edx, %ecx
-; CHECK-32-NEXT:    sbbl %esi, %ecx
+; CHECK-32-NEXT:    sbbl %edi, %ecx
 ; CHECK-32-NEXT:    jae .LBB3_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB3_1 Depth=1
@@ -215,19 +215,19 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; CHECK-32-NEXT:    pushl %ebx
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-32-NEXT:    .cfi_offset %ebx, -8
-; CHECK-32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movzbl (%edx), %eax
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movzbl (%ecx), %eax
+; CHECK-32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; CHECK-32-NEXT:    jmp .LBB4_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB4_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB4_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchgb %bl, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgb %bl, (%ecx)
 ; CHECK-32-NEXT:    je .LBB4_4
 ; CHECK-32-NEXT:  .LBB4_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    movl %eax, %ebx
-; CHECK-32-NEXT:    subb %cl, %bl
+; CHECK-32-NEXT:    subb %dl, %bl
 ; CHECK-32-NEXT:    jae .LBB4_3
 ; CHECK-32-NEXT:  # %bb.2: # in Loop: Header=BB4_1 Depth=1
 ; CHECK-32-NEXT:    xorl %ebx, %ebx
@@ -265,20 +265,20 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-32-NEXT:    .cfi_offset %esi, -12
 ; CHECK-32-NEXT:    .cfi_offset %edi, -8
-; CHECK-32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movzwl (%edx), %eax
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movzwl (%ecx), %eax
+; CHECK-32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; CHECK-32-NEXT:    jmp .LBB5_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB5_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchgw %si, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgw %si, (%ecx)
 ; CHECK-32-NEXT:    je .LBB5_4
 ; CHECK-32-NEXT:  .LBB5_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    xorl %esi, %esi
 ; CHECK-32-NEXT:    movl %eax, %edi
-; CHECK-32-NEXT:    subw %cx, %di
+; CHECK-32-NEXT:    subw %dx, %di
 ; CHECK-32-NEXT:    jb .LBB5_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB5_1 Depth=1
@@ -319,19 +319,19 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; CHECK-32-NEXT:    .cfi_offset %esi, -12
 ; CHECK-32-NEXT:    .cfi_offset %edi, -8
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-32-NEXT:    movl (%ecx), %eax
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    movl (%edx), %eax
 ; CHECK-32-NEXT:    jmp .LBB6_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB6_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; CHECK-32-NEXT:    lock cmpxchgl %esi, (%edx)
+; CHECK-32-NEXT:    lock cmpxchgl %esi, (%ecx)
 ; CHECK-32-NEXT:    je .LBB6_4
 ; CHECK-32-NEXT:  .LBB6_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    xorl %esi, %esi
 ; CHECK-32-NEXT:    movl %eax, %edi
-; CHECK-32-NEXT:    subl %ecx, %edi
+; CHECK-32-NEXT:    subl %edx, %edi
 ; CHECK-32-NEXT:    jb .LBB6_3
 ; CHECK-32-NEXT:  # %bb.2: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB6_1 Depth=1
@@ -377,25 +377,27 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; CHECK-32-NEXT:    .cfi_offset %edi, -16
 ; CHECK-32-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-32-NEXT:    .cfi_offset %ebp, -8
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-32-NEXT:    movl (%eax), %edi
+; CHECK-32-NEXT:    movl 4(%eax), %esi
 ; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-32-NEXT:    movl (%ebp), %esi
-; CHECK-32-NEXT:    movl 4(%ebp), %edi
 ; CHECK-32-NEXT:    jmp .LBB7_1
 ; CHECK-32-NEXT:    .p2align 4
 ; CHECK-32-NEXT:  .LBB7_3: # %atomicrmw.start
 ; CHECK-32-NEXT:    # in Loop: Header=BB7_1 Depth=1
-; CHECK-32-NEXT:    movl %esi, %eax
-; CHECK-32-NEXT:    movl %edi, %edx
-; CHECK-32-NEXT:    lock cmpxchg8b (%ebp)
-; CHECK-32-NEXT:    movl %eax, %esi
-; CHECK-32-NEXT:    movl %edx, %edi
+; CHECK-32-NEXT:    movl %edi, %eax
+; CHECK-32-NEXT:    movl %esi, %edx
+; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-32-NEXT:    lock cmpxchg8b (%esi)
+; CHECK-32-NEXT:    movl %eax, %edi
+; CHECK-32-NEXT:    movl %edx, %esi
 ; CHECK-32-NEXT:    je .LBB7_4
 ; CHECK-32-NEXT:  .LBB7_1: # %atomicrmw.start
 ; CHECK-32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-32-NEXT:    xorl %ecx, %ecx
-; CHECK-32-NEXT:    movl %esi, %eax
-; CHECK-32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %edi, %edx
+; CHECK-32-NEXT:    movl %edi, %eax
+; CHECK-32-NEXT:    subl %ebp, %eax
+; CHECK-32-NEXT:    movl %esi, %edx
 ; CHECK-32-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
 ; CHECK-32-NEXT:    movl $0, %ebx
 ; CHECK-32-NEXT:    jb .LBB7_3
@@ -405,8 +407,8 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; CHECK-32-NEXT:    movl %eax, %ebx
 ; CHECK-32-NEXT:    jmp .LBB7_3
 ; CHECK-32-NEXT:  .LBB7_4: # %atomicrmw.end
-; CHECK-32-NEXT:    movl %esi, %eax
-; CHECK-32-NEXT:    movl %edi, %edx
+; CHECK-32-NEXT:    movl %edi, %eax
+; CHECK-32-NEXT:    movl %esi, %edx
 ; CHECK-32-NEXT:    popl %esi
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-32-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/avx-cvt-3.ll b/llvm/test/CodeGen/X86/avx-cvt-3.ll
index 760db4af1f1b4..101162c1427bf 100644
--- a/llvm/test/CodeGen/X86/avx-cvt-3.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt-3.ll
@@ -89,16 +89,16 @@ define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) {
 define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
 ; X86-LABEL: sitofp_insert_constants_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7]
-; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    movl $2, %eax
-; X86-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
 ; X86-NEXT:    movl $-3, %eax
-; X86-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 46f67a8912d4a..6ed4d0b6e81ea 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -605,17 +605,17 @@ define void @test_x86_sse2_storeu_dq(ptr %a0, <16 x i8> %a1) {
   ; add operation forces the execution domain.
 ; X86-AVX-LABEL: test_x86_sse2_storeu_dq:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%eax) # encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_sse2_storeu_dq:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf8,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
@@ -643,23 +643,23 @@ define void @test_x86_sse2_storeu_pd(ptr %a0, <2 x double> %a1) {
   ; fadd operation forces the execution domain.
 ; X86-AVX-LABEL: test_x86_sse2_storeu_pd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX-NEXT:    vmovhpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; X86-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovupd %xmm0, (%eax) # encoding: [0xc5,0xf9,0x11,0x00]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_sse2_storeu_pd:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX512VL-NEXT:    vmovhpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; X86-AVX512VL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovupd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x00]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
@@ -722,21 +722,21 @@ define void @test_x86_avx_storeu_dq_256(ptr %a0, <32 x i8> %a1) {
   ; add operation forces the execution domain.
 ; X86-AVX-LABEL: test_x86_avx_storeu_dq_256:
 ; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1 # encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; X86-AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
+; X86-AVX-NEXT:    vpsubb %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xf8,0xca]
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm2 # encoding: [0xc5,0xf9,0xf8,0xd1]
-; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; X86-AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc1]
-; X86-AVX-NEXT:    vmovdqu %xmm0, 16(%eax) # encoding: [0xc5,0xfa,0x7f,0x40,0x10]
-; X86-AVX-NEXT:    vmovdqu %xmm2, (%eax) # encoding: [0xc5,0xfa,0x7f,0x10]
+; X86-AVX-NEXT:    vmovdqu %xmm1, 16(%eax) # encoding: [0xc5,0xfa,0x7f,0x48,0x10]
+; X86-AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf8,0xc2]
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%eax) # encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx_storeu_dq_256:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9]
 ; X86-AVX512VL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf8,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00]
 ; X86-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
@@ -770,18 +770,18 @@ define void @test_x86_avx_storeu_pd_256(ptr %a0, <4 x double> %a1) {
   ; add operation forces the execution domain.
 ; X86-AVX-LABEL: test_x86_avx_storeu_pd_256:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovupd %ymm0, (%eax) # encoding: [0xc5,0xfd,0x11,0x00]
 ; X86-AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx_storeu_pd_256:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovupd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x11,0x00]
 ; X86-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index f0e8987bd6b2d..e182453cd949c 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -913,17 +913,17 @@ declare void @llvm.x86.avx.vzeroupper() nounwind
 define void @movnt_dq(ptr %p, <2 x i64> %a1) nounwind {
 ; X86-AVX-LABEL: movnt_dq:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovntdq %xmm0, (%eax) # encoding: [0xc5,0xf9,0xe7,0x00]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: movnt_dq:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX512VL-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovntdq %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe7,0x00]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
@@ -982,18 +982,18 @@ define void @movnt_pd(ptr %p, <4 x double> %a1) nounwind {
   ; add operation forces the execution domain.
 ; X86-AVX-LABEL: movnt_pd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovntpd %ymm0, (%eax) # encoding: [0xc5,0xfd,0x2b,0x00]
 ; X86-AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: movnt_pd:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovntpd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
 ; X86-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll
index 1b688c8cf9cca..5645fa06be3ac 100644
--- a/llvm/test/CodeGen/X86/avx-select.ll
+++ b/llvm/test/CodeGen/X86/avx-select.ll
@@ -5,8 +5,8 @@
 define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
 ; X86-LABEL: select00:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovaps %ymm0, %ymm1
@@ -33,8 +33,8 @@ define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
 define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
 ; X86-LABEL: select01:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovaps %ymm0, %ymm1
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index 0bfd8921e8b42..b1831782c1971 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -25,17 +25,13 @@ entry:
 define <4 x i64> @A2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: A2:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    vbroadcastsd (%ecx), %ymm0
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vbroadcastsd (%eax), %ymm0
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: A2:
@@ -104,10 +100,10 @@ define <8 x i32> @B3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: B3:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X86-NEXT:    retl
@@ -158,8 +154,8 @@ define <4 x double> @C2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: C2:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastsd (%ecx), %ymm0
+; X86-NEXT:    vbroadcastsd (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -226,8 +222,8 @@ define <8 x float> @D3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: D3:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastss (%ecx), %ymm0
+; X86-NEXT:    vbroadcastss (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -276,8 +272,8 @@ define <4 x float> @e2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: e2:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastss (%ecx), %xmm0
+; X86-NEXT:    vbroadcastss (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -340,10 +336,10 @@ define <4 x i32> @F2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: F2:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT:    retl
 ;
@@ -593,17 +589,13 @@ entry:
 define <2 x i64> @G2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: G2:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: G2:
@@ -658,8 +650,8 @@ define <2 x double> @I2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
 ; X86-LABEL: I2:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -680,8 +672,8 @@ define <4 x float> @_RR(ptr %ptr, ptr %k) nounwind uwtable readnone ssp {
 ; X86-LABEL: _RR:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastss (%ecx), %xmm0
+; X86-NEXT:    vbroadcastss (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%eax)
 ; X86-NEXT:    retl
@@ -848,8 +840,8 @@ define void @broadcast_v16i32(ptr %a, ptr %b) {
 ; X86-LABEL: broadcast_v16i32:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastss (%ecx), %ymm0
+; X86-NEXT:    vbroadcastss (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups %ymm0, 32(%eax)
 ; X86-NEXT:    vmovups %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
@@ -878,12 +870,12 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmulpd (%eax), %xmm0, %xmm1
-; X86-NEXT:    vmulsd 16(%eax), %xmm0, %xmm0
 ; X86-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
+; X86-NEXT:    vmulsd 16(%eax), %xmm0, %xmm0
 ; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
index 9434bc22d9976..d4a3d60118a55 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -105,8 +105,8 @@ define void @subv_reuse_is_ok(ptr %a, ptr %b) {
 ; X86-LABEL: subv_reuse_is_ok:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -127,8 +127,8 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_2f64_4f64_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -147,8 +147,8 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_2i64_4i64_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -167,8 +167,8 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_4f32_8f32_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -187,8 +187,8 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_4i32_8i32_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -207,8 +207,8 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-LABEL: test_broadcast_8i16_16i16_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -227,8 +227,8 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-LABEL: test_broadcast_16i8_32i8_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -247,9 +247,9 @@ define <8 x i32> @PR29088(ptr %p0, ptr %p1) {
 ; X86-LABEL: PR29088:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm1, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index 4ce092c099b08..fb9a1833977d5 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -79,9 +79,9 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap
 ; X86-LABEL: insertps_from_vector_load_offset_2:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    vinsertps $0, 12(%ecx,%eax), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
@@ -144,13 +144,13 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
+; X86-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm2
 ; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X86-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
-; X86-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
index e971d1e471bf7..65c29e71d3717 100644
--- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -7,9 +7,9 @@ define void @test1(ptr %A, ptr %C) #0 {
 ; X86-LABEL: test1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -33,9 +33,9 @@ define void @test2(ptr %A, ptr %C) #0 {
 ; X86-LABEL: test2:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vmovaps (%eax), %xmm0
 ; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -59,9 +59,9 @@ define void @test3(ptr %A, ptr %C) #0 {
 ; X86-LABEL: test3:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vmovaps (%eax), %xmm0
 ; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -84,9 +84,9 @@ define void @test4(ptr %A, ptr %C) #0 {
 ; X86-LABEL: test4:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vmovaps (%eax), %xmm0
 ; X86-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/avx10_2-cmp.ll b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
index 8117345d9de04..4c4a67c4425fa 100644
--- a/llvm/test/CodeGen/X86/avx10_2-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
@@ -47,8 +47,8 @@ define i1 @hoeq_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: hoeq_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxsh (%eax), %xmm0
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -69,8 +69,8 @@ define i1 @hune_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: hune_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxsh (%eax), %xmm0
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -125,8 +125,8 @@ define i1 @foeq_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: foeq_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxss (%eax), %xmm0
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -147,8 +147,8 @@ define i1 @fune_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: fune_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxss (%eax), %xmm0
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -203,8 +203,8 @@ define i1 @doeq_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: doeq_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxsd (%eax), %xmm0
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -225,8 +225,8 @@ define i1 @dune_mem(ptr %xp, ptr %yp) {
 ; X86-LABEL: dune_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vucomxsd (%eax), %xmm0
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -253,11 +253,12 @@ define i32 @PR118606(x86_fp80 %val1) #0 {
 ;
 ; X86-LABEL: PR118606:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    fld1
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldz
 ; X86-NEXT:    fucomi %st(1), %st
 ; X86-NEXT:    fstp %st(1)
-; X86-NEXT:    fld1
+; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:    fcmovne %st(1), %st
 ; X86-NEXT:    fcmovu %st(1), %st
 ; X86-NEXT:    fucompi %st(1), %st
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index d9b4635042256..84ce9d5e02031 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -42,9 +42,9 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_add_bf16_512(<32 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vaddbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xc2]
-; X86-NEXT:    vaddbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x08]
-; X86-NEXT:    vaddbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1]
+; X86-NEXT:    vaddbf16 (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0x00]
+; X86-NEXT:    vaddbf16 %zmm2, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x58,0xca]
+; X86-NEXT:    vaddbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i32 %msk to <32 x i1>
   %val = load <32 x bfloat>, ptr %ptr
@@ -94,9 +94,9 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -152,9 +152,9 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_mul_bf16_512(<32 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmulbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xc2]
-; X86-NEXT:    vmulbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x08]
-; X86-NEXT:    vmulbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x59,0xc1]
+; X86-NEXT:    vmulbf16 (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0x00]
+; X86-NEXT:    vmulbf16 %zmm2, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x59,0xca]
+; X86-NEXT:    vmulbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x59,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i32 %msk to <32 x i1>
   %val = load <32 x bfloat>, ptr %ptr
@@ -206,9 +206,9 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_div_bf16_512(<32 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdivbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xc2]
-; X86-NEXT:    vdivbf16 (%eax), %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x08]
-; X86-NEXT:    vdivbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x5e,0xc1]
+; X86-NEXT:    vdivbf16 (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0x00]
+; X86-NEXT:    vdivbf16 %zmm2, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5e,0xca]
+; X86-NEXT:    vdivbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x5e,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i32 %msk to <32 x i1>
   %val = load <32 x bfloat>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
index 9225bd88b0896..cd0640c3130f9 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
@@ -171,8 +171,8 @@ define <32 x bfloat>@test_int_x86_avx512_mask_getexp_bf16_512(<32 x bfloat> %x0,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_getexp_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vgetexpbf16 %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x42,0xc0]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc8]
 ; X86-NEXT:    vaddbf16 %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf5,0x75,0x48,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -219,8 +219,8 @@ define <32 x bfloat>@test_int_x86_avx512_mask_scalef_bf16_512(<32 x bfloat> %x0,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_scalef_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vscalefbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf6,0x7c,0x48,0x2c,0xc1]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd0]
 ; X86-NEXT:    vaddbf16 %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf5,0x6d,0x48,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index e9c6cb6a19ba4..3c3dc1f9cf54c 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -390,13 +390,12 @@ declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i
 define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
 ; X86-LABEL: test_mm512_mask_mpsadbw:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xda]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xd9,0x02]
-; X86-NEXT:    vmpsadbw $3, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xe1,0x03]
+; X86-NEXT:    vmpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xd9,0x03]
 ; X86-NEXT:    vmpsadbw $4, %zmm1, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xc9,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X86-NEXT:    vmpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xc1,0x02]
+; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm512_mask_mpsadbw:
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 01b7618753a23..9de61c284e9b1 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -41,9 +41,9 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_add_bf16_256(<16 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vaddbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xc2]
-; X86-NEXT:    vaddbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x08]
-; X86-NEXT:    vaddbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc1]
+; X86-NEXT:    vaddbf16 (%eax), %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0x00]
+; X86-NEXT:    vaddbf16 %ymm2, %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x58,0xca]
+; X86-NEXT:    vaddbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i16 %msk to <16 x i1>
   %val = load <16 x bfloat>, ptr %ptr
@@ -95,9 +95,9 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_add_bf16_128(<8 x bfloat> %src, <8
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vaddbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xc2]
-; X86-NEXT:    vaddbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x08]
-; X86-NEXT:    vaddbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x58,0xc1]
+; X86-NEXT:    vaddbf16 (%eax), %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0x00]
+; X86-NEXT:    vaddbf16 %xmm2, %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x58,0xca]
+; X86-NEXT:    vaddbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i8 %msk to <8 x i1>
   %val = load <8 x bfloat>, ptr %ptr
@@ -147,9 +147,9 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -201,9 +201,9 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -257,9 +257,9 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_mul_bf16_256(<16 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmulbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xc2]
-; X86-NEXT:    vmulbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x08]
-; X86-NEXT:    vmulbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x59,0xc1]
+; X86-NEXT:    vmulbf16 (%eax), %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0x00]
+; X86-NEXT:    vmulbf16 %ymm2, %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x59,0xca]
+; X86-NEXT:    vmulbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x59,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i16 %msk to <16 x i1>
   %val = load <16 x bfloat>, ptr %ptr
@@ -311,9 +311,9 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_mul_bf16_128(<8 x bfloat> %src, <8
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmulbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xc2]
-; X86-NEXT:    vmulbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x08]
-; X86-NEXT:    vmulbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x59,0xc1]
+; X86-NEXT:    vmulbf16 (%eax), %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0x00]
+; X86-NEXT:    vmulbf16 %xmm2, %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x59,0xca]
+; X86-NEXT:    vmulbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x59,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i8 %msk to <8 x i1>
   %val = load <8 x bfloat>, ptr %ptr
@@ -366,9 +366,9 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_div_bf16_256(<16 x bfloat> %src,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdivbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xc2]
-; X86-NEXT:    vdivbf16 (%eax), %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x08]
-; X86-NEXT:    vdivbf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x5e,0xc1]
+; X86-NEXT:    vdivbf16 (%eax), %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0x00]
+; X86-NEXT:    vdivbf16 %ymm2, %ymm1, %ymm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5e,0xca]
+; X86-NEXT:    vdivbf16 %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf5,0x75,0x28,0x5e,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i16 %msk to <16 x i1>
   %val = load <16 x bfloat>, ptr %ptr
@@ -421,9 +421,9 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_div_bf16_128(<8 x bfloat> %src, <8
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vdivbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xc2]
-; X86-NEXT:    vdivbf16 (%eax), %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x08]
-; X86-NEXT:    vdivbf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7d,0x08,0x5e,0xc1]
+; X86-NEXT:    vdivbf16 (%eax), %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0x00]
+; X86-NEXT:    vdivbf16 %xmm2, %xmm1, %xmm1 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5e,0xca]
+; X86-NEXT:    vdivbf16 %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf5,0x75,0x08,0x5e,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %mask = bitcast i8 %msk to <8 x i1>
   %val = load <8 x bfloat>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
index 3d79457eb2a8a..3c560444aac9d 100644
--- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
@@ -12,12 +12,12 @@ declare  i48 @llvm.fptosi.sat.i48.f32 (float)
 define i24 @test_signed_i24_f32(float %f) nounwind {
 ; X86-LABEL: test_signed_i24_f32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $-8388608, %ecx # imm = 0xFF800000
 ; X86-NEXT:    vcvttss2sis {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl $-8388607, %eax # imm = 0xFF800001
-; X86-NEXT:    movl $-8388608, %ecx # imm = 0xFF800000
 ; X86-NEXT:    cmovgel %eax, %ecx
-; X86-NEXT:    cmpl $8388607, %ecx # imm = 0x7FFFFF
 ; X86-NEXT:    movl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    cmpl $8388607, %ecx # imm = 0x7FFFFF
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -41,21 +41,21 @@ define i48 @test_signed_i48_f32(float %f) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vcvttps2qq %xmm1, %xmm1
-; X86-NEXT:    vmovd %xmm1, %esi
+; X86-NEXT:    vcvttps2qq %xmm0, %xmm0
+; X86-NEXT:    vmovd %xmm0, %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    cmovbl %ecx, %esi
-; X86-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    movl $-32768, %edi # imm = 0x8000
 ; X86-NEXT:    cmovael %eax, %edi
-; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
+; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    cmovbel %edi, %edx
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    cmovbel %esi, %eax
-; X86-NEXT:    vucomiss %xmm0, %xmm0
+; X86-NEXT:    vucomiss %xmm1, %xmm1
 ; X86-NEXT:    cmovpl %ecx, %eax
 ; X86-NEXT:    cmovpl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -99,21 +99,21 @@ define i64 @test_signed_i64_f32(float %f) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vcvttps2qq %xmm1, %xmm1
-; X86-NEXT:    vmovd %xmm1, %esi
+; X86-NEXT:    vcvttps2qq %xmm0, %xmm0
+; X86-NEXT:    vmovd %xmm0, %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    cmovbl %ecx, %esi
-; X86-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    movl $-2147483648, %edi # imm = 0x80000000
 ; X86-NEXT:    cmovael %eax, %edi
-; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    cmovbel %edi, %edx
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    cmovbel %esi, %eax
-; X86-NEXT:    vucomiss %xmm0, %xmm0
+; X86-NEXT:    vucomiss %xmm1, %xmm1
 ; X86-NEXT:    cmovpl %ecx, %eax
 ; X86-NEXT:    cmovpl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -138,12 +138,12 @@ declare  i48 @llvm.fptosi.sat.i48.f64 (double)
 define i24 @test_signed_i24_f64(double %f) nounwind {
 ; X86-LABEL: test_signed_i24_f64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $-8388608, %ecx # imm = 0xFF800000
 ; X86-NEXT:    vcvttsd2sis {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl $-8388607, %eax # imm = 0xFF800001
-; X86-NEXT:    movl $-8388608, %ecx # imm = 0xFF800000
 ; X86-NEXT:    cmovgel %eax, %ecx
-; X86-NEXT:    cmpl $8388607, %ecx # imm = 0x7FFFFF
 ; X86-NEXT:    movl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    cmpl $8388607, %ecx # imm = 0x7FFFFF
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -221,8 +221,8 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; X86-NEXT:    vpextrd $1, %xmm1, %eax
 ; X86-NEXT:    movl $-2147483648, %edi # imm = 0x80000000
 ; X86-NEXT:    cmovael %eax, %edi
-; X86-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    cmovbel %edi, %edx
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    cmovbel %esi, %eax
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index bf7f9375570f9..d0dd15e0065d5 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -570,13 +570,12 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_mask_mpsadbw_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xd9,0x02]
-; X86-NEXT:    vmpsadbw $3, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xe1,0x03]
+; X86-NEXT:    vmpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xd9,0x03]
 ; X86-NEXT:    vmpsadbw $4, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0x89,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X86-NEXT:    vmpsadbw $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x02]
+; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mask_mpsadbw_128:
@@ -604,13 +603,12 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0,
 define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_mask_mpsadbw_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xd9,0x02]
-; X86-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xe1,0x03]
+; X86-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xd9,0x03]
 ; X86-NEXT:    vmpsadbw $4, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xa9,0x42,0xd1,0x04]
-; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X86-NEXT:    vmpsadbw $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x02]
+; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mask_mpsadbw_256:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index 00db1fb07c78d..b5a51d6701579 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -3,12 +3,19 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define i32 @test_x86_avx512_vcvttsd2usis(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_vcvttsd2usis:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttsd2usis %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x08,0x6c,0xc8]
-; CHECK-NEXT:    vcvttsd2usis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x18,0x6c,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X64-LABEL: test_x86_avx512_vcvttsd2usis:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttsd2usis %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x08,0x6c,0xc8]
+; X64-NEXT:    vcvttsd2usis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x18,0x6c,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_x86_avx512_vcvttsd2usis:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttsd2usis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x18,0x6c,0xc8]
+; X86-NEXT:    vcvttsd2usis %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x08,0x6c,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    retl # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx10.vcvttsd2usis(<2 x double> %a0, i32 4) ;
   %res1 = call i32 @llvm.x86.avx10.vcvttsd2usis(<2 x double> %a0, i32 8) ;
   %res2 = add i32 %res0, %res1
@@ -17,12 +24,19 @@ define i32 @test_x86_avx512_vcvttsd2usis(<2 x double> %a0) {
 declare i32 @llvm.x86.avx10.vcvttsd2usis(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_vcvttsd2sis(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_vcvttsd2sis:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttsd2sis %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x08,0x6d,0xc8]
-; CHECK-NEXT:    vcvttsd2sis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x18,0x6d,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X64-LABEL: test_x86_avx512_vcvttsd2sis:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttsd2sis %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x08,0x6d,0xc8]
+; X64-NEXT:    vcvttsd2sis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x18,0x6d,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_x86_avx512_vcvttsd2sis:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttsd2sis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7f,0x18,0x6d,0xc8]
+; X86-NEXT:    vcvttsd2sis %xmm0, %eax # encoding: [0x62,0xf5,0x7f,0x08,0x6d,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    retl # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx10.vcvttsd2sis(<2 x double> %a0, i32 4) ;
   %res1 = call i32 @llvm.x86.avx10.vcvttsd2sis(<2 x double> %a0, i32 8) ;
   %res2 = add i32 %res0, %res1
@@ -31,12 +45,19 @@ define i32 @test_x86_avx512_vcvttsd2sis(<2 x double> %a0) {
 declare i32 @llvm.x86.avx10.vcvttsd2sis(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_vcvttss2sis(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_vcvttss2sis:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttss2sis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x18,0x6d,0xc8]
-; CHECK-NEXT:    vcvttss2sis %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x08,0x6d,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X64-LABEL: test_x86_avx512_vcvttss2sis:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttss2sis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x18,0x6d,0xc8]
+; X64-NEXT:    vcvttss2sis %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x08,0x6d,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_x86_avx512_vcvttss2sis:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttss2sis %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x08,0x6d,0xc8]
+; X86-NEXT:    vcvttss2sis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x18,0x6d,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    retl # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx10.vcvttss2sis(<4 x float> %a0, i32 8) ;
   %res1 = call i32 @llvm.x86.avx10.vcvttss2sis(<4 x float> %a0, i32 4) ;
   %res2 = add i32 %res0, %res1
@@ -61,12 +82,19 @@ define i32 @test_x86_avx512_vcvttss2sis_load(ptr %a0) {
 }
 
 define i32 @test_x86_avx512_vcvttss2usis(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_vcvttss2usis:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttss2usis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x18,0x6c,0xc8]
-; CHECK-NEXT:    vcvttss2usis %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x08,0x6c,0xc0]
-; CHECK-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X64-LABEL: test_x86_avx512_vcvttss2usis:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttss2usis {sae}, %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x18,0x6c,0xc8]
+; X64-NEXT:    vcvttss2usis %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x08,0x6c,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_x86_avx512_vcvttss2usis:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttss2usis %xmm0, %ecx # encoding: [0x62,0xf5,0x7e,0x08,0x6c,0xc8]
+; X86-NEXT:    vcvttss2usis {sae}, %xmm0, %eax # encoding: [0x62,0xf5,0x7e,0x18,0x6c,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    retl # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx10.vcvttss2usis(<4 x float> %a0, i32 8) ;
   %res1 = call i32 @llvm.x86.avx10.vcvttss2usis(<4 x float> %a0, i32 4) ;
   %res2 = add i32 %res0, %res1
diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index ee504e30302b1..0691ca8b3b25f 100644
--- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -129,14 +129,23 @@ define <4 x double> @test9(<4 x double> %a) {
 }
 
 define <4 x double> @test10(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: test10:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [-9.5E+0,u,-5.5E+0,-2.5E+0]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3
-; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1
-; CHECK-NEXT:    vfnmadd213pd {{.*#+}} ymm2 = -(ymm0 * ymm2) + ymm1
-; CHECK-NEXT:    vaddpd %ymm2, %ymm3, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test10:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovapd {{.*#+}} ymm2 = [-9.5E+0,u,-5.5E+0,-2.5E+0]
+; X86-NEXT:    vmovapd %ymm2, %ymm3
+; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm1
+; X86-NEXT:    vfmadd213pd {{.*#+}} ymm2 = (ymm0 * ymm2) + ymm1
+; X86-NEXT:    vaddpd %ymm3, %ymm2, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test10:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovapd {{.*#+}} ymm2 = [-9.5E+0,u,-5.5E+0,-2.5E+0]
+; X64-NEXT:    vmovapd %ymm2, %ymm3
+; X64-NEXT:    vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1
+; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm2 = -(ymm0 * ymm2) + ymm1
+; X64-NEXT:    vaddpd %ymm2, %ymm3, %ymm0
+; X64-NEXT:    retq
   %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double -95.00000e-01, double undef, double -55.00000e-01, double -25.00000e-01>, <4 x double> %b)
   %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 95.00000e-01, double undef, double 55.00000e-01, double 25.00000e-01>, <4 x double> %b)
   %t2 = fadd <4 x double> %t0, %t1
@@ -157,13 +166,21 @@ define <4 x double> @test11(<4 x double> %a) {
 }
 
 define <4 x double> @test12(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: test12:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [-7.5E+0,-2.5E+0,-5.5E+0,-9.5E+0]
-; CHECK-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem
-; CHECK-NEXT:    vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
-; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test12:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovapd {{.*#+}} ymm2 = [-7.5E+0,-2.5E+0,-5.5E+0,-9.5E+0]
+; X86-NEXT:    vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
+; X86-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem
+; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test12:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovapd {{.*#+}} ymm2 = [-7.5E+0,-2.5E+0,-5.5E+0,-9.5E+0]
+; X64-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem
+; X64-NEXT:    vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2
+; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %t0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> <double 75.00000e-01, double 25.00000e-01, double 55.00000e-01, double 95.00000e-01>, <4 x double> <double -75.00000e-01, double undef, double -55.00000e-01, double -95.00000e-01>)
   %t1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %b, <4 x double> <double undef, double 25.00000e-01, double 55.00000e-01, double 95.00000e-01>, <4 x double> <double -75.00000e-01, double -25.00000e-01, double -55.00000e-01, double -95.00000e-01>)
   %t2 = fadd <4 x double> %t0, %t1
diff --git a/llvm/test/CodeGen/X86/avx2-gather.ll b/llvm/test/CodeGen/X86/avx2-gather.ll
index 4b77edefa820d..5b641b75fd5d6 100644
--- a/llvm/test/CodeGen/X86/avx2-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-gather.ll
@@ -97,8 +97,8 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(ptr %a1, <4 x i32> %idx, <4 x
 define <2 x i64> @test_mm_i32gather_epi32(ptr%a0, <2 x i64> %a1) {
 ; X86-LABEL: test_mm_i32gather_epi32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
@@ -123,9 +123,9 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, <4 x i32>, <4 x i32>
 define <2 x double> @test_mm_i32gather_pd(ptr%a0, <2 x i64> %a1) {
 ; X86-LABEL: test_mm_i32gather_pd:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X86-NEXT:    vmovapd %xmm1, %xmm0
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index 0cbaa4f63a74c..f276854d2ccf4 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -389,9 +389,9 @@ define void @test_x86_avx_storeu_dq_256(ptr %a0, <32 x i8> %a1) {
   ; add operation forces the execution domain.
 ; X86-LABEL: test_x86_avx_storeu_dq_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; X86-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 6b994b0782e6f..f27ba5a2aab05 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1062,30 +1062,30 @@ define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
 define <4 x i32> @test_x86_avx2_psllv_d_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psllv_d_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,1,1,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A]
-; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295]
+; X86-AVX-NEXT:    vpsllvd %xmm0, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0xc0]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,9,0,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9]
-; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1]
+; X86-AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0x0d,A,A,A,A]
+; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [1,1,1,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,1,1,4294967295]
+; X86-AVX512VL-NEXT:    vpsllvd %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc0]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,9,0,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9]
-; X86-AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; X86-AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0x0d,A,A,A,A]
+; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psllv_d_const:
@@ -1140,32 +1140,32 @@ define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 define <8 x i32> @test_x86_avx2_psllv_d_256_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psllv_d_256_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,9,0,4294967295,3,7,4294967295,0]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1]
+; X86-AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,9,0,4294967295,3,7,4294967295,0]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; X86-AVX512VL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psllv_d_256_const:
@@ -1333,32 +1333,32 @@ define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
 define <4 x i32> @test_x86_avx2_psrlv_d_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [4,4,4,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,9,0,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1]
+; X86-AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [2,9,0,4294967295]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [4,4,4,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,4,4,4294967295]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,9,0,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
+; X86-AVX512VL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_const:
@@ -1415,32 +1415,32 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 define <8 x i32> @test_x86_avx2_psrlv_d_256_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256_const:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4294967295]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295]
+; X86-AVX-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,9,0,4294967295,3,7,4294967295,0]
 ; X86-AVX-NEXT:    # encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A]
 ; X86-AVX-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1]
+; X86-AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,4,4,4,4,4,4,4294967295]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295]
+; X86-AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [2,9,0,4294967295,3,7,4294967295,0]
 ; X86-AVX512VL-NEXT:    # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A]
 ; X86-AVX512VL-NEXT:    # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; X86-AVX512VL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256_const:
@@ -2000,19 +2000,19 @@ define <8 x float>  @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <
 ;; gather with mask
 ; X86-AVX-LABEL: test_gather_mask:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vmovaps %ymm2, %ymm3 # encoding: [0xc5,0xfc,0x28,0xda]
-; X86-AVX-NEXT:    vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 # encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
+; X86-AVX-NEXT:    vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 # encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX-NEXT:    vmovups %ymm2, (%eax) # encoding: [0xc5,0xfc,0x11,0x10]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_gather_mask:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vmovaps %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; X86-AVX512VL-NEXT:    vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 # encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
+; X86-AVX512VL-NEXT:    vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 # encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512VL-NEXT:    vmovups %ymm2, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2041,15 +2041,15 @@ define <8 x float>  @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <
 define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i1> %mask) {
 ; X86-AVX-LABEL: test_mask_demanded_bits:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vpsllq $63, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x73,0xf2,0x3f]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 # encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_mask_demanded_bits:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vpsllq $63, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xf2,0x3f]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512VL-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 # encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
index 2429536bdb15f..ce8b11259ef52 100644
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -9,10 +9,10 @@ declare <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr> %ptrs, i32 %align, <2 x i1
 define <2 x i32> @masked_gather_v2i32(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
 ; X86-LABEL: masked_gather_v2i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    vpgatherdd %xmm0, (,%xmm2), %xmm1
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -58,10 +58,10 @@ entry:
 define <4 x i32> @masked_gather_v2i32_concat(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
 ; X86-LABEL: masked_gather_v2i32_concat:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    vpgatherdd %xmm0, (,%xmm2), %xmm1
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -110,10 +110,10 @@ declare <2 x float> @llvm.masked.gather.v2float(<2 x ptr> %ptrs, i32 %align, <2
 define <2 x float> @masked_gather_v2float(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
 ; X86-LABEL: masked_gather_v2float:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps %xmm0, (,%xmm2), %xmm1
 ; X86-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -160,10 +160,10 @@ entry:
 define <4 x float> @masked_gather_v2float_concat(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
 ; X86-LABEL: masked_gather_v2float_concat:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps %xmm0, (,%xmm2), %xmm1
 ; X86-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -335,10 +335,10 @@ declare <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %ptrs, i32 %align, <8 x i1
 define <8 x i32> @masked_gather_v8i32(ptr %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
 ; X86-LABEL: masked_gather_v8i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa (%eax), %ymm2
+; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-NEXT:    vpgatherdd %ymm0, (,%ymm2), %ymm1
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 ; X86-NEXT:    retl
@@ -438,10 +438,10 @@ declare <8 x float> @llvm.masked.gather.v8float(<8 x ptr> %ptrs, i32 %align, <8
 define <8 x float> @masked_gather_v8float(ptr %ptr, <8 x i1> %masks, <8 x float> %passthro) {
 ; X86-LABEL: masked_gather_v8float:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps (%eax), %ymm2
+; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-NEXT:    vgatherdps %ymm0, (,%ymm2), %ymm1
 ; X86-NEXT:    vmovaps %ymm1, %ymm0
 ; X86-NEXT:    retl
@@ -541,10 +541,10 @@ declare <4 x i64> @llvm.masked.gather.v4i64(<4 x ptr> %ptrs, i32 %align, <4 x i1
 define <4 x i64> @masked_gather_v4i64(ptr %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
 ; X86-LABEL: masked_gather_v4i64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
-; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa (%eax), %xmm2
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
+; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; X86-NEXT:    vpgatherdq %ymm0, (,%xmm2), %ymm1
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 ; X86-NEXT:    retl
@@ -609,10 +609,10 @@ declare <4 x double> @llvm.masked.gather.v4double(<4 x ptr> %ptrs, i32 %align, <
 define <4 x double> @masked_gather_v4double(ptr %ptr, <4 x i1> %masks, <4 x double> %passthro) {
 ; X86-LABEL: masked_gather_v4double:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vpslld $31, %xmm0, %xmm0
-; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovapd (%eax), %xmm2
+; X86-NEXT:    vpslld $31, %xmm0, %xmm0
+; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; X86-NEXT:    vgatherdpd %ymm0, (,%xmm2), %ymm1
 ; X86-NEXT:    vmovapd %ymm1, %ymm0
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx2-nontemporal.ll b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
index cd16b30184482..bb723d32aa88c 100644
--- a/llvm/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/avx2-nontemporal.ll
@@ -9,29 +9,29 @@ define i32 @f(<8 x float> %A, ptr %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    vmovdqa 104(%ebp), %ymm3
-; X86-NEXT:    vmovdqa 72(%ebp), %ymm4
-; X86-NEXT:    vmovdqa 40(%ebp), %ymm5
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 136(%ebp), %edx
-; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 136(%ebp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT:    vmovntps %ymm0, (%ecx)
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    vmovntps %ymm0, (%edx)
+; X86-NEXT:    addl (%ecx), %eax
 ; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm0
-; X86-NEXT:    addl (%edx), %eax
-; X86-NEXT:    vmovntdq %ymm0, (%ecx)
+; X86-NEXT:    vmovntdq %ymm0, (%edx)
+; X86-NEXT:    addl (%ecx), %eax
 ; X86-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    addl (%edx), %eax
-; X86-NEXT:    vmovntpd %ymm0, (%ecx)
-; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm5, %ymm0
-; X86-NEXT:    addl (%edx), %eax
-; X86-NEXT:    vmovntdq %ymm0, (%ecx)
-; X86-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm4, %ymm0
-; X86-NEXT:    addl (%edx), %eax
-; X86-NEXT:    vmovntdq %ymm0, (%ecx)
-; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm0
-; X86-NEXT:    addl (%edx), %eax
-; X86-NEXT:    vmovntdq %ymm0, (%ecx)
+; X86-NEXT:    vmovntpd %ymm0, (%edx)
+; X86-NEXT:    addl (%ecx), %eax
+; X86-NEXT:    vmovdqa 40(%ebp), %ymm0
+; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vmovntdq %ymm0, (%edx)
+; X86-NEXT:    addl (%ecx), %eax
+; X86-NEXT:    vmovdqa 72(%ebp), %ymm0
+; X86-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vmovntdq %ymm0, (%edx)
+; X86-NEXT:    addl (%ecx), %eax
+; X86-NEXT:    vmovdqa 104(%ebp), %ymm0
+; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vmovntdq %ymm0, (%edx)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index c50af6968f5bb..cf1ee285e6b8b 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1041,8 +1041,8 @@ define void @broadcast_v16i32(ptr %a, ptr %b) {
 ; X86-AVX2-LABEL: broadcast_v16i32:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vbroadcastss (%ecx), %ymm0
+; X86-AVX2-NEXT:    vbroadcastss (%eax), %ymm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovups %ymm0, 32(%eax)
 ; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1059,8 +1059,8 @@ define void @broadcast_v16i32(ptr %a, ptr %b) {
 ; X86-AVX512VL-LABEL: broadcast_v16i32:
 ; X86-AVX512VL:       ## %bb.0:
 ; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512VL-NEXT:    vbroadcastss (%ecx), %zmm0
+; X86-AVX512VL-NEXT:    vbroadcastss (%eax), %zmm0
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VL-NEXT:    vmovups %zmm0, (%eax)
 ; X86-AVX512VL-NEXT:    vzeroupper
 ; X86-AVX512VL-NEXT:    retl
@@ -1092,9 +1092,9 @@ define void @isel_crash_16b(ptr %cV_R.addr) {
 ; X86:       ## %bb.0: ## %eintry
 ; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 64
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpbroadcastb (%eax), %xmm1
 ; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovdqa %xmm1, {{[0-9]+}}(%esp)
@@ -1134,9 +1134,9 @@ define void @isel_crash_32b(ptr %cV_R.addr) {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %ymm0, (%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vpbroadcastb (%eax), %ymm1
 ; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
@@ -1183,9 +1183,9 @@ define void @isel_crash_8w(ptr %cV_R.addr) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 64
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpbroadcastw (%eax), %xmm1
 ; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovdqa %xmm1, {{[0-9]+}}(%esp)
@@ -1225,9 +1225,9 @@ define void @isel_crash_16w(ptr %cV_R.addr) {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %ymm0, (%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vpbroadcastw (%eax), %ymm1
 ; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
@@ -1274,9 +1274,9 @@ define void @isel_crash_4d(ptr %cV_R.addr) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 64
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vbroadcastss (%eax), %xmm1
 ; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
@@ -1316,9 +1316,9 @@ define void @isel_crash_8d(ptr %cV_R.addr) {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %ymm0, (%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vbroadcastss (%eax), %ymm1
 ; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
@@ -1365,9 +1365,9 @@ define void @isel_crash_2q(ptr %cV_R.addr) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 64
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; X86-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
@@ -1406,9 +1406,9 @@ define void @isel_crash_4q(ptr %cV_R.addr) {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %ymm0, (%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vbroadcastsd (%eax), %ymm1
 ; X86-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
index b5cf3b616f6a6..ecb0228a70edf 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcasti128.ll
@@ -120,10 +120,10 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_2f64_4f64_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovapd %xmm1, (%eax)
+; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovapd %xmm0, (%eax)
+; X86-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_2f64_4f64_reuse:
@@ -143,10 +143,10 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_2i64_4i64_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovdqa %xmm1, (%eax)
+; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_2i64_4i64_reuse:
@@ -166,10 +166,10 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_4f32_8f32_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovaps %xmm1, (%eax)
+; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_4f32_8f32_reuse:
@@ -189,10 +189,10 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
 ; X86-LABEL: test_broadcast_4i32_8i32_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovdqa %xmm1, (%eax)
+; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_4i32_8i32_reuse:
@@ -212,10 +212,10 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-LABEL: test_broadcast_8i16_16i16_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovdqa %xmm1, (%eax)
+; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_8i16_16i16_reuse:
@@ -235,10 +235,10 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-LABEL: test_broadcast_16i8_32i8_reuse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
-; X86-NEXT:    vmovdqa %xmm1, (%eax)
+; X86-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_broadcast_16i8_32i8_reuse:
@@ -258,9 +258,9 @@ define <8 x i32> @PR29088(ptr %p0, ptr %p1) {
 ; X86-LABEL: PR29088:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm1, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 23b46ee59154f..285f17d475e26 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -383,8 +383,8 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond) {
 ;
 ; KNL_X32-LABEL: test10:
 ; KNL_X32:       ## %bb.0:
-; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; KNL_X32-NEXT:    cmovnel %eax, %ecx
 ; KNL_X32-NEXT:    movl (%ecx), %eax
@@ -456,33 +456,29 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
 ; KNL_X32:       ## %bb.0:
 ; KNL_X32-NEXT:    pushl %ebx
 ; KNL_X32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_X32-NEXT:    pushl %edi
-; KNL_X32-NEXT:    .cfi_def_cfa_offset 12
 ; KNL_X32-NEXT:    pushl %esi
-; KNL_X32-NEXT:    .cfi_def_cfa_offset 16
-; KNL_X32-NEXT:    subl $16, %esp
+; KNL_X32-NEXT:    .cfi_def_cfa_offset 12
+; KNL_X32-NEXT:    subl $20, %esp
 ; KNL_X32-NEXT:    .cfi_def_cfa_offset 32
-; KNL_X32-NEXT:    .cfi_offset %esi, -16
-; KNL_X32-NEXT:    .cfi_offset %edi, -12
+; KNL_X32-NEXT:    .cfi_offset %esi, -12
 ; KNL_X32-NEXT:    .cfi_offset %ebx, -8
-; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    movl %edi, (%esp)
+; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; KNL_X32-NEXT:    movl %esi, (%esp)
 ; KNL_X32-NEXT:    calll _test11
 ; KNL_X32-NEXT:    movl %eax, %ebx
 ; KNL_X32-NEXT:    movzbl %al, %eax
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    movl %edi, (%esp)
+; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    movl %esi, (%esp)
 ; KNL_X32-NEXT:    calll _test10
 ; KNL_X32-NEXT:    xorl %ecx, %ecx
 ; KNL_X32-NEXT:    testb $1, %bl
 ; KNL_X32-NEXT:    cmovel %ecx, %eax
-; KNL_X32-NEXT:    addl $16, %esp
+; KNL_X32-NEXT:    addl $20, %esp
 ; KNL_X32-NEXT:    popl %esi
-; KNL_X32-NEXT:    popl %edi
 ; KNL_X32-NEXT:    popl %ebx
 ; KNL_X32-NEXT:    retl
 ;
@@ -1309,6 +1305,10 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    pushl %edi
 ; KNL_X32-NEXT:    pushl %esi
 ; KNL_X32-NEXT:    subl $16, %esp
+; KNL_X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k0
+; KNL_X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; KNL_X32-NEXT:    kandw %k0, %k1, %k0
+; KNL_X32-NEXT:    kmovw %k0, %ecx
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k0
@@ -1436,8 +1436,8 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %k5, (%esp) ## 2-byte Spill
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k5
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; KNL_X32-NEXT:    kmovw %edx, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL_X32-NEXT:    kshiftrw $14, %k5, %k5
 ; KNL_X32-NEXT:    kmovw %eax, %k6
@@ -1526,50 +1526,47 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kshiftlw $14, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $1, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $1, %k0, %k0
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    andl $1, %ecx
+; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    movb %cl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw (%esp), %k1 ## 2-byte Reload
 ; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; KNL_X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k2
-; KNL_X32-NEXT:    kandw %k1, %k2, %k1
-; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    movb %bl, 2(%eax)
-; KNL_X32-NEXT:    kmovw %k0, %ebx
-; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    kmovw %k0, %ebp
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebx
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    leal (,%ebx,2), %ebx
+; KNL_X32-NEXT:    addl %ebp, %ebx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %edi
+; KNL_X32-NEXT:    leal (%ebx,%edi,4), %edi
+; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    leal (%ebx,%esi,4), %ebx
+; KNL_X32-NEXT:    leal (%edi,%esi,8), %edi
 ; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    leal (%ebx,%edi,8), %ebx
-; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $4, %edx
-; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    orl %edi, %edx
+; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
@@ -1578,42 +1575,42 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $6, %ebp
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $7, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $7, %ebx
+; KNL_X32-NEXT:    orl %ebp, %ebx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    shll $8, %esi
+; KNL_X32-NEXT:    orl %ebx, %esi
+; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $8, %edi
+; KNL_X32-NEXT:    shll $9, %edi
 ; KNL_X32-NEXT:    orl %esi, %edi
 ; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $9, %ebx
-; KNL_X32-NEXT:    orl %edi, %ebx
-; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $10, %edx
-; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    orl %edi, %edx
+; KNL_X32-NEXT:    kmovw %k1, %edi
 ; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL_X32-NEXT:    orl %ecx, %edx
 ; KNL_X32-NEXT:    kmovw %k0, %ecx
 ; KNL_X32-NEXT:    andl $1, %ebp
 ; KNL_X32-NEXT:    shll $11, %ebp
+; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    shll $12, %ebx
+; KNL_X32-NEXT:    orl %ebp, %ebx
 ; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $12, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    shll $13, %esi
+; KNL_X32-NEXT:    orl %ebx, %esi
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $13, %edi
+; KNL_X32-NEXT:    shll $14, %edi
 ; KNL_X32-NEXT:    orl %esi, %edi
-; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $14, %ebx
-; KNL_X32-NEXT:    orl %edi, %ebx
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $15, %ecx
-; KNL_X32-NEXT:    orl %ebx, %ecx
+; KNL_X32-NEXT:    orl %edi, %ecx
 ; KNL_X32-NEXT:    orl %edx, %ecx
 ; KNL_X32-NEXT:    movw %cx, (%eax)
 ; KNL_X32-NEXT:    addl $16, %esp
@@ -2717,8 +2714,6 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ;
 ; KNL_X32-LABEL: test17:
 ; KNL_X32:       ## %bb.0:
-; KNL_X32-NEXT:    pushl %ebx
-; KNL_X32-NEXT:    subl $16, %esp
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k0
@@ -2726,409 +2721,393 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
 ; KNL_X32-NEXT:    kshiftrw $14, %k1, %k1
-; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    korw %k1, %k0, %k1
 ; KNL_X32-NEXT:    movw $-5, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k1
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k0
+; KNL_X32-NEXT:    kandw %k0, %k1, %k1
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k2
 ; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
 ; KNL_X32-NEXT:    kshiftrw $13, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k2, %k1, %k2
 ; KNL_X32-NEXT:    movw $-9, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kandw %k1, %k2, %k2
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k3
 ; KNL_X32-NEXT:    kshiftlw $15, %k3, %k3
 ; KNL_X32-NEXT:    kshiftrw $12, %k3, %k3
-; KNL_X32-NEXT:    korw %k3, %k0, %k0
+; KNL_X32-NEXT:    korw %k3, %k2, %k3
 ; KNL_X32-NEXT:    movw $-17, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k3
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k2
+; KNL_X32-NEXT:    kandw %k2, %k3, %k3
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k4
 ; KNL_X32-NEXT:    kshiftlw $15, %k4, %k4
 ; KNL_X32-NEXT:    kshiftrw $11, %k4, %k4
-; KNL_X32-NEXT:    korw %k4, %k0, %k0
+; KNL_X32-NEXT:    korw %k4, %k3, %k4
 ; KNL_X32-NEXT:    movw $-33, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k4
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k3
+; KNL_X32-NEXT:    kandw %k3, %k4, %k4
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k5
 ; KNL_X32-NEXT:    kshiftlw $15, %k5, %k5
 ; KNL_X32-NEXT:    kshiftrw $10, %k5, %k5
-; KNL_X32-NEXT:    korw %k5, %k0, %k0
+; KNL_X32-NEXT:    korw %k5, %k4, %k5
 ; KNL_X32-NEXT:    movw $-65, %ax
-; KNL_X32-NEXT:    kmovw %eax, %k5
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k4
+; KNL_X32-NEXT:    kandw %k4, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    korw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    kmovw %eax, %k0
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $14, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k5, %k6, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
 ; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    korw %k0, %k6, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $12, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $11, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $10, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k6
-; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
-; KNL_X32-NEXT:    kshiftrw $9, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k0, %k0
-; KNL_X32-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k0
-; KNL_X32-NEXT:    kshiftlw $15, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k0
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
-; KNL_X32-NEXT:    korw %k0, %k7, %k0
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
+; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
+; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k0, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $13, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k2, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k1, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $12, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k3, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k2, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $11, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k4, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k3, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $10, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
-; KNL_X32-NEXT:    kandw %k5, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k4, %k6, %k6
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $9, %k7, %k7
-; KNL_X32-NEXT:    korw %k7, %k0, %k0
+; KNL_X32-NEXT:    korw %k7, %k6, %k6
+; KNL_X32-NEXT:    kandw %k6, %k5, %k5
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andl $1, %eax
-; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_X32-NEXT:    kmovw %ecx, %k7
+; KNL_X32-NEXT:    kmovw %eax, %k6
+; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; KNL_X32-NEXT:    kmovw %eax, %k7
 ; KNL_X32-NEXT:    kshiftlw $15, %k7, %k7
 ; KNL_X32-NEXT:    kshiftrw $14, %k7, %k7
-; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    korw %k7, %k6, %k6
-; KNL_X32-NEXT:    kandw %k1, %k6, %k1
+; KNL_X32-NEXT:    kandw %k0, %k6, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k6
 ; KNL_X32-NEXT:    kshiftlw $15, %k6, %k6
 ; KNL_X32-NEXT:    kshiftrw $13, %k6, %k6
-; KNL_X32-NEXT:    korw %k6, %k1, %k1
-; KNL_X32-NEXT:    kandw %k2, %k1, %k1
+; KNL_X32-NEXT:    korw %k6, %k0, %k0
+; KNL_X32-NEXT:    kandw %k1, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $12, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k3, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $12, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k2, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $11, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k4, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $11, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k3, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $10, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k5, %k1, %k1
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $10, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k4, %k0, %k0
 ; KNL_X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; KNL_X32-NEXT:    kmovw %eax, %k2
-; KNL_X32-NEXT:    kshiftlw $15, %k2, %k2
-; KNL_X32-NEXT:    kshiftrw $9, %k2, %k2
-; KNL_X32-NEXT:    korw %k2, %k1, %k1
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload
-; KNL_X32-NEXT:    kandw %k1, %k0, %k0
-; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %ecx
-; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %eax
+; KNL_X32-NEXT:    kmovw %eax, %k1
+; KNL_X32-NEXT:    kshiftlw $15, %k1, %k1
+; KNL_X32-NEXT:    kshiftrw $9, %k1, %k1
+; KNL_X32-NEXT:    korw %k1, %k0, %k0
+; KNL_X32-NEXT:    kandw %k0, %k5, %k0
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k0, %ebx
-; KNL_X32-NEXT:    andb $1, %bl
-; KNL_X32-NEXT:    andb $1, %dl
-; KNL_X32-NEXT:    addb %dl, %dl
-; KNL_X32-NEXT:    orb %bl, %dl
-; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
-; KNL_X32-NEXT:    andb $1, %bl
-; KNL_X32-NEXT:    shlb $2, %bl
-; KNL_X32-NEXT:    orb %dl, %bl
-; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $4, %k0, %k0
+; KNL_X32-NEXT:    kshiftrw $2, %k0, %k2
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k3
+; KNL_X32-NEXT:    kshiftrw $4, %k0, %k4
+; KNL_X32-NEXT:    kshiftrw $5, %k0, %k5
+; KNL_X32-NEXT:    kshiftrw $6, %k0, %k6
+; KNL_X32-NEXT:    kmovw %k0, %eax
+; KNL_X32-NEXT:    andb $1, %al
+; KNL_X32-NEXT:    kmovw %k1, %ecx
+; KNL_X32-NEXT:    andb $1, %cl
+; KNL_X32-NEXT:    addb %cl, %cl
+; KNL_X32-NEXT:    orb %al, %cl
+; KNL_X32-NEXT:    kmovw %k2, %eax
+; KNL_X32-NEXT:    andb $1, %al
+; KNL_X32-NEXT:    shlb $2, %al
+; KNL_X32-NEXT:    orb %cl, %al
+; KNL_X32-NEXT:    kmovw %k3, %ecx
+; KNL_X32-NEXT:    andb $1, %cl
+; KNL_X32-NEXT:    shlb $3, %cl
+; KNL_X32-NEXT:    orb %al, %cl
+; KNL_X32-NEXT:    kmovw %k4, %edx
 ; KNL_X32-NEXT:    andb $1, %dl
-; KNL_X32-NEXT:    shlb $3, %dl
-; KNL_X32-NEXT:    orb %bl, %dl
-; KNL_X32-NEXT:    kmovw %k0, %ebx
-; KNL_X32-NEXT:    andb $1, %bl
-; KNL_X32-NEXT:    shlb $4, %bl
-; KNL_X32-NEXT:    orb %dl, %bl
+; KNL_X32-NEXT:    shlb $4, %dl
+; KNL_X32-NEXT:    orb %cl, %dl
+; KNL_X32-NEXT:    kmovw %k5, %eax
 ; KNL_X32-NEXT:    andb $1, %al
 ; KNL_X32-NEXT:    shlb $5, %al
-; KNL_X32-NEXT:    orb %bl, %al
+; KNL_X32-NEXT:    orb %dl, %al
+; KNL_X32-NEXT:    kmovw %k6, %ecx
 ; KNL_X32-NEXT:    shlb $6, %cl
 ; KNL_X32-NEXT:    orb %al, %cl
-; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    andb $127, %cl
+; KNL_X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_X32-NEXT:    movb %cl, (%eax)
-; KNL_X32-NEXT:    addl $16, %esp
-; KNL_X32-NEXT:    popl %ebx
 ; KNL_X32-NEXT:    retl $4
 ;
 ; FASTISEL-LABEL: test17:
@@ -3546,14 +3525,14 @@ define void @v2i1_mem(<128 x i32> %x, <2 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $384, %esp ## imm = 0x180
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    vmovaps 264(%ebp), %xmm4
 ; KNL_X32-NEXT:    vmovaps %xmm4, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v2i1_mem_callee
@@ -3609,14 +3588,14 @@ define void @v4i1_mem(<128 x i32> %x, <4 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $384, %esp ## imm = 0x180
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    vmovaps 264(%ebp), %xmm4
 ; KNL_X32-NEXT:    vmovaps %xmm4, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v4i1_mem_callee
@@ -3672,14 +3651,14 @@ define void @v8i1_mem(<128 x i32> %x, <8 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $384, %esp ## imm = 0x180
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    vmovaps 264(%ebp), %xmm4
 ; KNL_X32-NEXT:    vmovaps %xmm4, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v8i1_mem_callee
@@ -3735,14 +3714,14 @@ define void @v16i1_mem(<128 x i32> %x, <16 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $384, %esp ## imm = 0x180
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    vmovaps 264(%ebp), %xmm4
 ; KNL_X32-NEXT:    vmovaps %xmm4, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v16i1_mem_callee
@@ -3810,14 +3789,14 @@ define void @v32i1_mem(<128 x i32> %x, <32 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $384, %esp ## imm = 0x180
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    vmovaps 264(%ebp), %ymm4
 ; KNL_X32-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v32i1_mem_callee
@@ -3999,10 +3978,6 @@ define void @v64i1_mem(<128 x i32> %x, <64 x i1> %y) {
 ; KNL_X32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_X32-NEXT:    andl $-64, %esp
 ; KNL_X32-NEXT:    subl $576, %esp ## imm = 0x240
-; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
-; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm5
-; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm6
-; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm7
 ; KNL_X32-NEXT:    movl 516(%ebp), %eax
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl 512(%ebp), %eax
@@ -4128,12 +4103,16 @@ define void @v64i1_mem(<128 x i32> %x, <64 x i1> %y) {
 ; KNL_X32-NEXT:    movl 272(%ebp), %eax
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl 268(%ebp), %eax
+; KNL_X32-NEXT:    vmovaps 200(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 136(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl 264(%ebp), %eax
+; KNL_X32-NEXT:    vmovaps 72(%ebp), %zmm4
+; KNL_X32-NEXT:    vmovaps %zmm4, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm7, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm6, {{[0-9]+}}(%esp)
-; KNL_X32-NEXT:    vmovaps %zmm5, {{[0-9]+}}(%esp)
+; KNL_X32-NEXT:    vmovaps 8(%ebp), %zmm4
 ; KNL_X32-NEXT:    vmovaps %zmm4, (%esp)
 ; KNL_X32-NEXT:    calll _v64i1_mem_callee
 ; KNL_X32-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
index 3b7803427ae5e..18aba7a6b1434 100644
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -9,11 +9,10 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_128:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_128:
@@ -28,16 +27,16 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_128:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x05]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vpternlogq $184, %xmm3, %xmm4, %xmm2 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xd3,0xb8]
+; X86NOBW-NEXT:    # xmm2 = xmm2 ^ (xmm4 & (xmm2 ^ xmm3))
 ; X86NOBW-NEXT:    vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
-; X86NOBW-NEXT:    vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8]
-; X86NOBW-NEXT:    # xmm2 = xmm2 ^ (xmm5 & (xmm2 ^ xmm0))
+; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x04]
+; X86NOBW-NEXT:    vpand %xmm0, %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xdb,0xc8]
 ; X86NOBW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86NOBW-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
@@ -74,11 +73,10 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_256:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_256:
@@ -93,22 +91,22 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_256:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x05]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
 ; X86NOBW-NEXT:    # zmm5 {%k2} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
-; X86NOBW-NEXT:    vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8]
-; X86NOBW-NEXT:    # ymm2 = ymm2 ^ (ymm5 & (ymm2 ^ ymm0))
-; X86NOBW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x5d,0x38,0xe5,0x01]
+; X86NOBW-NEXT:    vpternlogq $184, %ymm3, %ymm4, %ymm2 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xd3,0xb8]
+; X86NOBW-NEXT:    # ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm3))
+; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x04]
+; X86NOBW-NEXT:    vpand %ymm3, %ymm4, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xdb,0xdb]
+; X86NOBW-NEXT:    vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X86NOBW-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8affineinvqb_256:
@@ -148,11 +146,10 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 ; X86BW-LABEL: test_vgf2p8affineinvqb_512:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_512:
@@ -167,32 +164,32 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xd9,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k4} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
-; X86NOBW-NEXT:    # zmm5 {%k3} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff]
+; X86NOBW-NEXT:    # zmm3 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0a]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
 ; X86NOBW-NEXT:    # zmm5 {%k2} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff]
-; X86NOBW-NEXT:    # zmm6 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
-; X86NOBW-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
-; X86NOBW-NEXT:    vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8]
-; X86NOBW-NEXT:    # zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm3))
+; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x5d,0x38,0xe5,0x01]
+; X86NOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x3a,0xdb,0x01]
+; X86NOBW-NEXT:    vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x05]
+; X86NOBW-NEXT:    vpternlogq $184, %zmm4, %zmm3, %zmm2 # encoding: [0x62,0xf3,0xe5,0x48,0x25,0xd4,0xb8]
+; X86NOBW-NEXT:    # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm4))
+; X86NOBW-NEXT:    vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x04]
+; X86NOBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3 # encoding: [0x62,0xf1,0xe5,0x48,0xdb,0xdc]
+; X86NOBW-NEXT:    vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X86NOBW-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8affineinvqb_512:
@@ -246,11 +243,10 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_128:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_128:
@@ -265,16 +261,16 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_128:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x05]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vpternlogq $184, %xmm3, %xmm4, %xmm2 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xd3,0xb8]
+; X86NOBW-NEXT:    # xmm2 = xmm2 ^ (xmm4 & (xmm2 ^ xmm3))
 ; X86NOBW-NEXT:    vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86NOBW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd]
-; X86NOBW-NEXT:    vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8]
-; X86NOBW-NEXT:    # xmm2 = xmm2 ^ (xmm5 & (xmm2 ^ xmm0))
+; X86NOBW-NEXT:    vgf2p8affineqb $4, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x04]
+; X86NOBW-NEXT:    vpand %xmm0, %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xdb,0xc8]
 ; X86NOBW-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86NOBW-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
@@ -311,11 +307,10 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_256:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_256:
@@ -330,22 +325,22 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_256:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x05]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86NOBW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
 ; X86NOBW-NEXT:    # zmm5 {%k2} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01]
-; X86NOBW-NEXT:    vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8]
-; X86NOBW-NEXT:    # ymm2 = ymm2 ^ (ymm5 & (ymm2 ^ ymm0))
-; X86NOBW-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x5d,0x38,0xe5,0x01]
+; X86NOBW-NEXT:    vpternlogq $184, %ymm3, %ymm4, %ymm2 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xd3,0xb8]
+; X86NOBW-NEXT:    # ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm3))
+; X86NOBW-NEXT:    vgf2p8affineqb $4, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x04]
+; X86NOBW-NEXT:    vpand %ymm3, %ymm4, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xdb,0xdb]
+; X86NOBW-NEXT:    vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X86NOBW-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8affineqb_256:
@@ -385,11 +380,10 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
 ; X86BW-LABEL: test_vgf2p8affineqb_512:
 ; X86BW:       # %bb.0:
 ; X86BW-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
 ; X86BW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
-; X86BW-NEXT:    vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
+; X86BW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x04]
+; X86BW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X86BW-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86BW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_512:
@@ -404,32 +398,32 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_512:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x04]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x06]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
-; X86NOBW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
-; X86NOBW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xe1,0x04]
-; X86NOBW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xd9,0x05]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k4} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff]
-; X86NOBW-NEXT:    # zmm5 {%k3} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff]
+; X86NOBW-NEXT:    # zmm3 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0a]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
 ; X86NOBW-NEXT:    # zmm5 {%k2} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff]
-; X86NOBW-NEXT:    # zmm6 {%k1} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01]
-; X86NOBW-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01]
-; X86NOBW-NEXT:    vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc]
-; X86NOBW-NEXT:    vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8]
-; X86NOBW-NEXT:    # zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm3))
+; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x5d,0x38,0xe5,0x01]
+; X86NOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x3a,0xdb,0x01]
+; X86NOBW-NEXT:    vgf2p8affineqb $5, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x05]
+; X86NOBW-NEXT:    vpternlogq $184, %zmm4, %zmm3, %zmm2 # encoding: [0x62,0xf3,0xe5,0x48,0x25,0xd4,0xb8]
+; X86NOBW-NEXT:    # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm4))
+; X86NOBW-NEXT:    vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x04]
+; X86NOBW-NEXT:    vpandq %zmm4, %zmm3, %zmm3 # encoding: [0x62,0xf1,0xe5,0x48,0xdb,0xdc]
+; X86NOBW-NEXT:    vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X86NOBW-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8affineqb_512:
@@ -505,8 +499,8 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_128_mask:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    vgf2p8mulb %xmm1, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc9]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff]
 ; X86NOBW-NEXT:    # zmm0 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0]
@@ -547,8 +541,8 @@ define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i1
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_128_maskz:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
 ; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
@@ -599,9 +593,9 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_256_mask:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff]
 ; X86NOBW-NEXT:    # zmm0 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0]
@@ -650,9 +644,9 @@ define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i3
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_256_maskz:
 ; X86NOBW:       # %bb.0:
+; X86NOBW-NEXT:    vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff]
 ; X86NOBW-NEXT:    # zmm1 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
@@ -711,28 +705,28 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_512_mask:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
-; X86NOBW-NEXT:    vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff]
-; X86NOBW-NEXT:    # zmm0 {%k4} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff]
-; X86NOBW-NEXT:    # zmm3 {%k3} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff]
 ; X86NOBW-NEXT:    # zmm3 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff]
-; X86NOBW-NEXT:    # zmm4 {%k2} {z} = -1
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0a]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
 ; X86NOBW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01]
-; X86NOBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01]
-; X86NOBW-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca]
-; X86NOBW-NEXT:    # zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2))
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0x5d,0xc9,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff]
+; X86NOBW-NEXT:    # zmm5 {%k2} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm5, %ymm4, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x5d,0x38,0xe5,0x01]
+; X86NOBW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x3a,0xdb,0x01]
+; X86NOBW-NEXT:    vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1]
+; X86NOBW-NEXT:    vpternlogq $226, %zmm2, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x25,0xc2,0xe2]
+; X86NOBW-NEXT:    # zmm0 = zmm2 ^ (zmm3 & (zmm0 ^ zmm2))
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8mulb_512_mask:
@@ -786,27 +780,27 @@ define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i6
 ;
 ; X86NOBW-LABEL: test_vgf2p8mulb_512_maskz:
 ; X86NOBW:       # %bb.0:
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k3 # encoding: [0xc5,0xf8,0x90,0x5c,0x24,0x0a]
-; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08]
-; X86NOBW-NEXT:    vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff]
-; X86NOBW-NEXT:    # zmm1 {%k4} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff]
-; X86NOBW-NEXT:    # zmm2 {%k3} {z} = -1
-; X86NOBW-NEXT:    vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2]
-; X86NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86NOBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff]
 ; X86NOBW-NEXT:    # zmm2 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2]
-; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff]
-; X86NOBW-NEXT:    # zmm3 {%k2} {z} = -1
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0a]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff]
+; X86NOBW-NEXT:    # zmm3 {%k1} {z} = -1
 ; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
 ; X86NOBW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01]
-; X86NOBW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01]
-; X86NOBW-NEXT:    vpandq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xdb,0xc0]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86NOBW-NEXT:    kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff]
+; X86NOBW-NEXT:    # zmm3 {%k1} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb]
+; X86NOBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff]
+; X86NOBW-NEXT:    # zmm4 {%k2} {z} = -1
+; X86NOBW-NEXT:    vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4]
+; X86NOBW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01]
+; X86NOBW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xd2,0x01]
+; X86NOBW-NEXT:    vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1]
+; X86NOBW-NEXT:    vpandq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xdb,0xc0]
 ; X86NOBW-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64NOBW-LABEL: test_vgf2p8mulb_512_maskz:
diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index eb8bff26f3b77..0a7811599bf42 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -20,9 +20,9 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-64, %esp
 ; X32-NEXT:    subl $192, %esp
-; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    calll _func_float16_ptr
 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
 ; X32-NEXT:    movl %ebp, %esp
@@ -35,8 +35,8 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-64, %esp
 ; WIN32-NEXT:    subl $128, %esp
-; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; WIN32-NEXT:    movl %esp, %eax
+; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
@@ -99,9 +99,9 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X32-NEXT:    andl $-64, %esp
 ; X32-NEXT:    subl $256, %esp ## imm = 0x100
 ; X32-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
-; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    calll _func_float16_ptr
 ; X32-NEXT:    vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
@@ -116,8 +116,8 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-NEXT:    andl $-64, %esp
 ; WIN32-NEXT:    subl $192, %esp
 ; WIN32-NEXT:    vmovaps %zmm1, (%esp) # 64-byte Spill
-; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index eea4261ea30fe..a85e5aae49e8e 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -3238,13 +3238,21 @@ entry:
 }
 
 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
-; CHECK-LABEL: test_mm512_fnmsub_round_pd:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test_mm512_fnmsub_round_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vpxorq %zmm3, %zmm2, %zmm2
+; X86-NEXT:    vpxorq %zmm3, %zmm0, %zmm0
+; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_fnmsub_round_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
+; X64-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
+; X64-NEXT:    retq
 entry:
   %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
@@ -3469,13 +3477,21 @@ entry:
 }
 
 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
-; CHECK-LABEL: test_mm512_fnmsub_pd:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test_mm512_fnmsub_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vpxorq %zmm3, %zmm2, %zmm2
+; X86-NEXT:    vpxorq %zmm3, %zmm0, %zmm0
+; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_fnmsub_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
+; X64-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; X64-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
+; X64-NEXT:    retq
 entry:
   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
   %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
@@ -3702,13 +3718,21 @@ entry:
 }
 
 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
-; CHECK-LABEL: test_mm512_fnmsub_round_ps:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test_mm512_fnmsub_round_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vpxord %zmm3, %zmm2, %zmm2
+; X86-NEXT:    vpxord %zmm3, %zmm0, %zmm0
+; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_fnmsub_round_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; X64-NEXT:    vpxord %zmm3, %zmm2, %zmm0
+; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
+; X64-NEXT:    retq
 entry:
   %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
@@ -3933,13 +3957,21 @@ entry:
 }
 
 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
-; CHECK-LABEL: test_mm512_fnmsub_ps:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test_mm512_fnmsub_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vpxord %zmm3, %zmm2, %zmm2
+; X86-NEXT:    vpxord %zmm3, %zmm0, %zmm0
+; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_fnmsub_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; X64-NEXT:    vpxord %zmm3, %zmm2, %zmm0
+; X64-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
+; X64-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
   %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
@@ -6590,15 +6622,15 @@ define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
-; X86-NEXT:    vpsrlq $32, %xmm0, %xmm3
-; X86-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
-; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
-; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm1
+; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; X86-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; X86-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vpsllq $32, %xmm1, %xmm1
+; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-NEXT:    vzeroupper
@@ -6795,15 +6827,15 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
-; X86-NEXT:    vpsrlq $32, %xmm0, %xmm3
-; X86-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
-; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
-; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm1
+; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; X86-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
+; X86-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vpsllq $32, %xmm1, %xmm1
+; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 5a0a9b7c3c25c..f2452c5419e45 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -7,9 +7,9 @@ declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
 ; X86-LABEL: unpckbw_test:
 ; X86:       ## %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04]
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    kunpckbw %k1, %k0, %k0 ## encoding: [0xc5,0xfd,0x4b,0xc1]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x08]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    kunpckbw %k0, %k1, %k0 ## encoding: [0xc5,0xf5,0x4b,0xc0]
 ; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -58,9 +58,9 @@ declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i
 define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x9c,0x24,0x04,0x00,0x00,0x00]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x9c,0x24,0x04,0x00,0x00,0x00]
 ; X86-NEXT:    vpblendmq %zmm3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0xcb]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
@@ -534,10 +534,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i
 define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) {
 ; X86-LABEL: test_store1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovups %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovups %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovups %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -559,11 +559,11 @@ declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 )
 define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) {
 ; X86-LABEL: test_store2:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovupd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x11,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovupd %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x11,0x01]
 ; X86-NEXT:    vmovupd %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -585,10 +585,10 @@ declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8)
 define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) {
 ; X86-LABEL: test_mask_store_aligned_ps:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovaps %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovaps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -610,11 +610,11 @@ declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 )
 define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) {
 ; X86-LABEL: test_mask_store_aligned_pd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovapd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x29,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovapd %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x29,0x01]
 ; X86-NEXT:    vmovapd %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -636,11 +636,11 @@ declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8)
 define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_q_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x7f,0x01]
 ; X86-NEXT:    vmovdqu64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -662,10 +662,10 @@ declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8)
 define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_d_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu32 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu32 %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -687,11 +687,11 @@ declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16)
 define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_q_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqa64 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x7f,0x01]
 ; X86-NEXT:    vmovdqa64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -713,10 +713,10 @@ declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8)
 define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_d_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqa32 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa32 %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -794,10 +794,10 @@ declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16)
 define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_aligned_pd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovapd (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovapd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x00]
 ; X86-NEXT:    vmovapd (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x08]
 ; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
@@ -823,10 +823,10 @@ declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8)
 define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_pd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovupd (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovupd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x00]
 ; X86-NEXT:    vmovupd (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x08]
 ; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
@@ -854,12 +854,12 @@ declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr, <16 x i32>, i16)
 define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %data, i16 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_d:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x00]
-; X86-NEXT:    vmovdqu32 (%ecx), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x09]
+; X86-NEXT:    vmovdqu32 (%ecx), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x01]
+; X86-NEXT:    vmovdqu32 (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x08]
 ; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -883,13 +883,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr, <8 x i64>, i8)
 define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_q:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x00]
-; X86-NEXT:    vmovdqu64 (%ecx), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x09]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x01]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x08]
 ; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -941,10 +941,10 @@ declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8)
 define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_aligned_q:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovdqa64 (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovdqa64 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x00]
 ; X86-NEXT:    vmovdqa64 (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x08]
 ; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
@@ -2432,9 +2432,9 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi64_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -2453,9 +2453,9 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p
 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi64_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -2490,9 +2490,9 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi64_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -2513,9 +2513,9 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %
 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi64_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -2597,9 +2597,9 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi64_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -2618,9 +2618,9 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p
 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi64_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -2655,9 +2655,9 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi64_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -2678,9 +2678,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %
 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi64_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -4241,8 +4241,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpsrlq $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0xe5,0x49,0x73,0xd0,0x04]
-; X86-NEXT:    vpsrlq $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x73,0xd0,0x05]
 ; X86-NEXT:    vpsrlq $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xc9,0x73,0xd0,0x06]
+; X86-NEXT:    vpsrlq $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x73,0xd0,0x05]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -4272,8 +4272,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpsrld $4, %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf1,0x65,0x49,0x72,0xd0,0x04]
-; X86-NEXT:    vpsrld $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0x72,0xd0,0x05]
 ; X86-NEXT:    vpsrld $6, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xc9,0x72,0xd0,0x06]
+; X86-NEXT:    vpsrld $5, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0x72,0xd0,0x05]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5371,9 +5371,9 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -5392,9 +5392,9 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5430,9 +5430,9 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -5454,9 +5454,9 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64>
 define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk_buildvector:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -5484,9 +5484,9 @@ define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b
 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5506,9 +5506,9 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask)
 define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz_buildvector:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5597,9 +5597,9 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -5618,9 +5618,9 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5659,12 +5659,12 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
 ; X86-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A]
 ; X86-NEXT:    ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -5686,12 +5686,12 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64>
 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
 ; X86-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
 ; X86-NEXT:    ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -5984,31 +5984,57 @@ define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) {
 declare <8 x i64> @llvm.x86.avx512.movntdqa(ptr) nounwind readonly
 
 define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
-; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8]
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_cmp_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X64-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8]
+; X64-NEXT:    vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
@@ -6033,24 +6059,24 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask)
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
-; X86-NEXT:    vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0]
-; X86-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9]
-; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -6101,31 +6127,57 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask)
 declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
-; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_ucmp_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X64-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01]
+; X64-NEXT:    vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
@@ -6150,24 +6202,24 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
-; X86-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01]
-; X86-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06]
-; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -6218,31 +6270,57 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
 declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
 
 define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf2,0xf5,0x48,0x37,0xc0]
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd9,0x05]
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_cmp_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf2,0xf5,0x48,0x37,0xc0]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf2,0xf5,0x48,0x37,0xc0]
+; X64-NEXT:    vpcmpleq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x02]
+; X64-NEXT:    vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd9,0x05]
+; X64-NEXT:    vpcmpgtq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe1]
+; X64-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
@@ -6267,25 +6345,25 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
-; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xc0]
-; X86-NEXT:    vpcmpleq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd1,0x02]
-; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x05]
-; X86-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
 ; X86-NEXT:    kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx ## encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmpleq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -6337,31 +6415,57 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xd9,0x05]
-; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe1,0x06]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_ucmp_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc1,0x01]
+; X64-NEXT:    vpcmpleuq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc9,0x02]
+; X64-NEXT:    vpcmpneqq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xd9,0x05]
+; X64-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe1,0x06]
+; X64-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
@@ -6386,25 +6490,25 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
-; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x01]
-; X86-NEXT:    vpcmpleuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x02]
-; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xe1,0x05]
-; X86-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
 ; X86-NEXT:    kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx ## encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx ## encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -6465,9 +6569,9 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0,
 ; X86-NEXT:    ## zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0xc8]
-; X86-NEXT:    vmovaps %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xd0]
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc9]
+; X86-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xc0]
 ; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
-; X86-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
@@ -6568,9 +6672,9 @@ define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x
 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, <8 x double> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vbroadcastf64x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x1b,0x00]
 ; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -6697,9 +6801,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8
 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 x i64> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vbroadcasti64x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x5b,0x00]
 ; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -7021,15 +7125,25 @@ define i16 @test_kxor(i16 %a0, i16 %a1) {
 
 declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
 define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) {
-; CHECK-LABEL: test_kortestz:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
-; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_kortestz:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xc3,0x04]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; X86-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; X86-NEXT:    kortestw %k0, %k1 ## encoding: [0xc5,0xf8,0x98,0xc8]
+; X86-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kortestz:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
+; X64-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; X64-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
+; X64-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
   %1 = bitcast <8 x i64> %B to <16 x i32>
@@ -7045,15 +7159,25 @@ entry:
 
 declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
 define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) {
-; CHECK-LABEL: test_kortestc:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
-; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-LABEL: test_kortestc:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xc3,0x04]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; X86-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; X86-NEXT:    kortestw %k0, %k1 ## encoding: [0xc5,0xf8,0x98,0xc8]
+; X86-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kortestc:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
+; X64-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; X64-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
+; X64-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
   %1 = bitcast <8 x i64> %B to <16 x i32>
@@ -7161,9 +7285,9 @@ define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mul_epi32_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -7184,9 +7308,9 @@ define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT
 define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mul_epi32_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -7224,9 +7348,9 @@ define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mul_epi32_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -7250,9 +7374,9 @@ define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass
 define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mul_epi32_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -7341,9 +7465,9 @@ define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mul_epu32_rmk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x08]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -7364,9 +7488,9 @@ define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT
 define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mul_epu32_rmkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -7407,12 +7531,12 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) {
 define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mul_epu32_rmbk:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
 ; X86-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x58,0xdb,0x15,A,A,A,A]
 ; X86-NEXT:    ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -7436,12 +7560,12 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass
 define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mul_epu32_rmbkz:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
 ; X86-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x58,0xdb,0x0d,A,A,A,A]
 ; X86-NEXT:    ## fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -7942,9 +8066,9 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xd9,0x77,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -8891,9 +9015,9 @@ declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>
 define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_pd_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vcompresspd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -8913,8 +9037,8 @@ declare void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double>
 define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) {
 ; X86-LABEL: test_compress_store_pd_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompresspd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -8932,8 +9056,8 @@ define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) {
 define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_ps_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompressps %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x8a,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -8953,8 +9077,8 @@ declare void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float>
 define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) {
 ; X86-LABEL: test_compress_store_ps_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompressps %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x8a,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -8972,9 +9096,9 @@ define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) {
 define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_q_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpcompressq %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -8994,8 +9118,8 @@ declare void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %da
 define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) {
 ; X86-LABEL: test_compress_store_q_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressq %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -9013,8 +9137,8 @@ define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) {
 define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_d_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x8b,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -9034,8 +9158,8 @@ declare void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %d
 define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) {
 ; X86-LABEL: test_compress_store_d_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressd %zmm0, (%eax) {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x8b,0x00]
 ; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -9053,9 +9177,9 @@ define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) {
 define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_pd_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9071,9 +9195,9 @@ define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data,
 define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_pd_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9091,8 +9215,8 @@ declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x do
 define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) {
 ; X86-LABEL: test_expand_load_pd_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9170,9 +9294,9 @@ define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) {
 define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_q_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9188,9 +9312,9 @@ define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %ma
 define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_q_512:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x89,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9208,8 +9332,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %
 define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) {
 ; X86-LABEL: test_expand_load_q_512:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -9932,14 +10056,14 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
 define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8]
+; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8]
-; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xda]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm1 * xmm3) + xmm2
 ; X86-NEXT:    vmovapd %xmm0, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe0]
-; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0xf5,0x78,0xa9,0xe2]
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xe2]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm1 * xmm4) + xmm2
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd9,0x58,0xdb]
 ; X86-NEXT:    vfmadd213sd {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa9,0xc2]
 ; X86-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -9969,14 +10093,14 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4
 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %xmm0, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovaps %xmm0, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xda]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm1 * xmm3) + xmm2
 ; X86-NEXT:    vmovaps %xmm0, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe0]
-; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf2,0x75,0x78,0xa9,0xe2]
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa9,0xe2]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm1 * xmm4) + xmm2
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd8,0x58,0xdb]
 ; X86-NEXT:    vfmadd213ss {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa9,0xc2]
 ; X86-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10009,10 +10133,10 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vmovapd %xmm0, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xd8]
-; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xda]
-; X86-NEXT:    ## xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
-; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xf9,0xa9,0xc2]
-; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0xc5,0xe1,0x58,0xc0]
+; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xf9,0xa9,0xda]
+; X86-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
+; X86-NEXT:    ## xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
+; X86-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
@@ -10057,14 +10181,14 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>
 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
+; X86-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
-; X86-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm0 * xmm1) + xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
-; X86-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xb9,0xe1]
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm0 * xmm1) + xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd9,0x58,0xdb]
 ; X86-NEXT:    vfmadd231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0xb9,0xd1]
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10094,14 +10218,14 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
+; X86-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
-; X86-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm0 * xmm1) + xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
-; X86-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xb9,0xe1]
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm0 * xmm1) + xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd8,0x58,0xdb]
 ; X86-NEXT:    vfmadd231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x59,0xb9,0xd1]
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe8,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10130,17 +10254,17 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_ss_mask_memfold:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02]
+; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x09]
 ; X86-NEXT:    vfmadd213ss %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xc8]
 ; X86-NEXT:    ## xmm1 = (xmm0 * xmm1) + xmm0
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1]
-; X86-NEXT:    vmovss %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x02]
+; X86-NEXT:    vmovss %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: fmadd_ss_mask_memfold:
@@ -10178,15 +10302,15 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_ss_maskz_memfold:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02]
+; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    vfmadd231ss (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x01]
 ; X86-NEXT:    ## xmm0 = (xmm0 * mem) + xmm0
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc0]
-; X86-NEXT:    vmovss %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x02]
+; X86-NEXT:    vmovss %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: fmadd_ss_maskz_memfold:
@@ -10222,17 +10346,17 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_sd_mask_memfold:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02]
+; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x09]
 ; X86-NEXT:    vfmadd213sd %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xc8]
 ; X86-NEXT:    ## xmm1 = (xmm0 * xmm1) + xmm0
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1]
-; X86-NEXT:    vmovsd %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x02]
+; X86-NEXT:    vmovsd %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: fmadd_sd_mask_memfold:
@@ -10266,15 +10390,15 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_sd_maskz_memfold:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02]
+; X86-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    vfmadd231sd (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x01]
 ; X86-NEXT:    ## xmm0 = (xmm0 * mem) + xmm0
-; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x10,0xc0]
-; X86-NEXT:    vmovsd %xmm0, (%edx) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x02]
+; X86-NEXT:    vmovsd %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-LABEL: fmadd_sd_maskz_memfold:
@@ -10307,14 +10431,14 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
+; X86-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
-; X86-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
-; X86-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbb,0xe1]
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd9,0x58,0xdb]
 ; X86-NEXT:    vfmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0xbb,0xd1]
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10344,14 +10468,14 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <
 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
+; X86-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
-; X86-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = (xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
-; X86-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbb,0xe1]
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = (xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd8,0x58,0xdb]
 ; X86-NEXT:    vfmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x59,0xbb,0xd1]
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe8,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10381,14 +10505,14 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double
 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
+; X86-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm2, %xmm3 ## encoding: [0xc5,0xf9,0x28,0xda]
-; X86-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4 ## encoding: [0xc5,0xf9,0x28,0xe2]
-; X86-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0xfd,0x78,0xbf,0xe1]
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = -(xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd9,0x58,0xdb]
 ; X86-NEXT:    vfnmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0xbf,0xd1]
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10418,14 +10542,14 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>,
 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
 ; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
+; X86-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xd9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovaps %xmm2, %xmm3 ## encoding: [0xc5,0xf8,0x28,0xda]
-; X86-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd9]
-; X86-NEXT:    ## xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4 ## encoding: [0xc5,0xf8,0x28,0xe2]
-; X86-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 ## encoding: [0x62,0xf2,0x7d,0x78,0xbf,0xe1]
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 ## encoding: [0xc5,0xe0,0x58,0xdc]
+; X86-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm4 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xe1]
+; X86-NEXT:    ## xmm4 {%k1} = -(xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3 ## encoding: [0xc5,0xd8,0x58,0xdb]
 ; X86-NEXT:    vfnmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x59,0xbf,0xd1]
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0xc5,0xe8,0x58,0xc3]
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10453,9 +10577,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd231ss (%eax), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb9,0x08]
 ; X86-NEXT:    ## xmm1 {%k1} = (xmm0 * mem) + xmm1
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x28,0xc1]
@@ -10477,9 +10601,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
 ; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd132ss (%eax), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x99,0x00]
 ; X86-NEXT:    ## xmm0 {%k1} = (xmm0 * mem) + xmm1
 ; X86-NEXT:    retl ## encoding: [0xc3]
@@ -10968,10 +11092,10 @@ declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i3
 define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) {
 ; X86-LABEL: test_cmp_512:
 ; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    vcmpltps %zmm3, %zmm2, %k0 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xc3,0x01]
+; X86-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc9,0x01]
+; X86-NEXT:    kxnorw %k0, %k1, %k1 ## encoding: [0xc5,0xf4,0x46,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vcmpltps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x01]
-; X86-NEXT:    vcmpltps %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6c,0x48,0xc2,0xcb,0x01]
-; X86-NEXT:    kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9]
 ; X86-NEXT:    vmovaps (%eax), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x00]
 ; X86-NEXT:    retl ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index b979f7531cd36..aba798608a07a 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -800,9 +800,9 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovaps %xmm2, %xmm3
-; X86-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
-; X86-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; X86-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; X86-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm3 {%k1}
+; X86-NEXT:    vsqrtss %xmm1, %xmm0, %xmm2 {%k1}
+; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm2
 ; X86-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; X86-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
@@ -840,9 +840,9 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
-; X86-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; X86-NEXT:    vaddpd %xmm2, %xmm3, %xmm2
+; X86-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm3 {%k1}
+; X86-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm2 {%k1}
+; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm2
 ; X86-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; X86-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
@@ -860,12 +860,19 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 }
 
 define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
-; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvttsd2usi:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttsd2usi %xmm0, %ecx
+; X64-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvttsd2usi:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttsd2usi {sae}, %xmm0, %ecx
+; X86-NEXT:    vcvttsd2usi %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
   %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
   %res2 = add i32 %res0, %res1
@@ -874,12 +881,19 @@ define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvttsd2si:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
-; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvttsd2si:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttsd2si %xmm0, %ecx
+; X64-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvttsd2si:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttsd2si {sae}, %xmm0, %ecx
+; X86-NEXT:    vcvttsd2si %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
   %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
   %res2 = add i32 %res0, %res1
@@ -888,12 +902,19 @@ define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
 declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvttss2si:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
-; CHECK-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvttss2si:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
+; X64-NEXT:    vcvttss2si %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvttss2si:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttss2si %xmm0, %ecx
+; X86-NEXT:    vcvttss2si {sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
   %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
   %res2 = add i32 %res0, %res1
@@ -918,12 +939,19 @@ define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) {
 }
 
 define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvttss2usi:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
-; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvttss2usi:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
+; X64-NEXT:    vcvttss2usi %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvttss2usi:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvttss2usi %xmm0, %ecx
+; X86-NEXT:    vcvttss2usi {sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
   %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
   %res2 = add i32 %res0, %res1
@@ -932,14 +960,23 @@ define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
 declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2usi %xmm0, %eax
-; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvtsd2usi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvtsd2usi %xmm0, %eax
+; X64-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; X64-NEXT:    addl %eax, %ecx
+; X64-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvtsd2usi32:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %eax
+; X86-NEXT:    vcvtsd2usi %xmm0, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
 
   %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
   %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11)
@@ -951,14 +988,23 @@ define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
 declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2si %xmm0, %eax
-; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvtsd2si32:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvtsd2si %xmm0, %eax
+; X64-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
+; X64-NEXT:    addl %eax, %ecx
+; X64-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvtsd2si32:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %eax
+; X86-NEXT:    vcvtsd2si %xmm0, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
 
   %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
   %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11)
@@ -970,14 +1016,23 @@ define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2usi %xmm0, %eax
-; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvtss2usi32:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvtss2usi %xmm0, %eax
+; X64-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
+; X64-NEXT:    addl %eax, %ecx
+; X64-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvtss2usi32:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %eax
+; X86-NEXT:    vcvtss2usi %xmm0, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
 
   %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
   %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11)
@@ -989,14 +1044,23 @@ define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx512_cvtss2si32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2si %xmm0, %eax
-; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X64-LABEL: test_x86_avx512_cvtss2si32:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvtss2si %xmm0, %eax
+; X64-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
+; X64-NEXT:    addl %eax, %ecx
+; X64-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_x86_avx512_cvtss2si32:
+; X86:       # %bb.0:
+; X86-NEXT:    vcvtss2si {rz-sae}, %xmm0, %eax
+; X86-NEXT:    vcvtss2si %xmm0, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
 
   %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
   %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11)
@@ -1021,12 +1085,11 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; X86-LABEL: test_x86_vcvtps2ph_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vcvtps2ph $2, %zmm0, (%eax)
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
 ; X86-NEXT:    vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
-; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
-; X86-NEXT:    vcvtps2ph $2, %zmm0, (%eax)
-; X86-NEXT:    vmovdqa %ymm1, %ymm0
+; X86-NEXT:    vcvtps2ph $3, {sae}, %zmm0, %ymm0 {%k1} {z}
+; X86-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
   %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
@@ -1098,9 +1161,9 @@ define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) {
 ;
 ; X86-LABEL: test_mask_store_ss:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vmovss %xmm0, (%eax) {%k1}
 ; X86-NEXT:    retl
   %1 = and i8 %mask, 1
@@ -3085,9 +3148,9 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ;
 ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
   %x2s = load double, ptr %x2ptr
@@ -3230,8 +3293,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovqb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3258,10 +3321,10 @@ define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovqb %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovqb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3288,8 +3351,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovsqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovsqb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3316,10 +3379,10 @@ define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsqb %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovsqb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3346,8 +3409,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovusqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovusqb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3374,10 +3437,10 @@ define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovusqb %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovusqb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3404,8 +3467,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovqw %zmm0, %xmm2
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
@@ -3432,10 +3495,10 @@ define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovqw %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovqw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3462,8 +3525,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovsqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovsqw %zmm0, %xmm2
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
@@ -3490,10 +3553,10 @@ define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsqw %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovsqw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3520,8 +3583,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vpmovusqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovusqw %zmm0, %xmm2
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
@@ -3548,10 +3611,10 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovusqw %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovusqw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3622,10 +3685,10 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovqd %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovqd %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3694,10 +3757,10 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsqd %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovsqd %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3766,10 +3829,10 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovusqd %zmm0, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vpmovusqd %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3795,8 +3858,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8>
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovdb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3823,9 +3886,9 @@ define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovdb %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovdb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3851,8 +3914,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8>
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovsdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovsdb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3879,9 +3942,9 @@ define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsdb %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovsdb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3907,8 +3970,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovusdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpmovusdb %zmm0, %xmm2
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -3935,9 +3998,9 @@ define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovusdb %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovusdb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -3962,8 +4025,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpmovdw %zmm0, %ymm2
 ; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
@@ -3989,9 +4052,9 @@ define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovdw %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovdw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4016,8 +4079,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i1
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovsdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpmovsdw %zmm0, %ymm2
 ; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovsdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
@@ -4043,9 +4106,9 @@ define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsdw %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovsdw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4070,8 +4133,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpmovusdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpmovusdw %zmm0, %ymm2
 ; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovusdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
@@ -4097,9 +4160,9 @@ define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovusdw %zmm0, (%eax)
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovusdw %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4432,9 +4495,9 @@ define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x f
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovaps %xmm2, %xmm3
-; X86-NEXT:    vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
-; X86-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; X86-NEXT:    vaddps %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1}
+; X86-NEXT:    vgetexpss %xmm1, %xmm0, %xmm2 {%k1}
+; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
   %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -4485,9 +4548,9 @@ define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
-; X86-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; X86-NEXT:    vaddpd %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm3 {%k1}
+; X86-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm2 {%k1}
+; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
   %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -4556,25 +4619,20 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcmplesd %xmm1, %xmm0, %k0
+; X86-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
+; X86-NEXT:    kmovw %k0, %eax
+; X86-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
-; X86-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
-; X86-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
+; X86-NEXT:    vcmplesd %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -4633,25 +4691,20 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vcmpless %xmm1, %xmm0, %k0
+; X86-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
+; X86-NEXT:    kmovw %k0, %eax
+; X86-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %ecx
+; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
-; X86-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
-; X86-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
+; X86-NEXT:    vcmpless %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl %esi, %eax
 ; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
   %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
@@ -4734,13 +4787,13 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
-; X86-NEXT:    vgetmantsd $12, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; X86-NEXT:    vgetmantsd $13, {sae}, %xmm1, %xmm0, %xmm3 {%k1}
+; X86-NEXT:    vgetmantsd $14, %xmm1, %xmm0, %xmm4
 ; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
-; X86-NEXT:    vgetmantsd $13, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; X86-NEXT:    vgetmantsd $14, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm2 {%k1}
+; X86-NEXT:    vgetmantsd $12, %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
-; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %res  = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -4774,9 +4827,9 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
 ; X86-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vgetmantss $12, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm2
-; X86-NEXT:    vgetmantss $13, {sae}, %xmm1, %xmm0, %xmm3
-; X86-NEXT:    vgetmantss $14, %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vgetmantss $14, %xmm1, %xmm0, %xmm3
+; X86-NEXT:    vgetmantss $13, {sae}, %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vaddps %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    vaddps %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    retl
   %res  = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
@@ -5412,11 +5465,11 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovapd %zmm0, %zmm3
-; X86-NEXT:    vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
-; X86-NEXT:    vaddpd %zmm4, %zmm3, %zmm3
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vfixupimmpd $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; X86-NEXT:    vmovapd %zmm0, %zmm4
+; X86-NEXT:    vfixupimmpd $4, %zmm2, %zmm1, %zmm4 {%k1}
+; X86-NEXT:    vaddpd %zmm3, %zmm4, %zmm3
 ; X86-NEXT:    vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
 ; X86-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
 ; X86-NEXT:    retl
@@ -5464,12 +5517,12 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vmovapd %zmm0, %zmm4
+; X86-NEXT:    vfixupimmpd $5, %zmm3, %zmm1, %zmm4 {%k1} {z}
 ; X86-NEXT:    vmovapd %zmm0, %zmm3
 ; X86-NEXT:    vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovapd %zmm0, %zmm5
-; X86-NEXT:    vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
-; X86-NEXT:    vaddpd %zmm5, %zmm3, %zmm3
+; X86-NEXT:    vaddpd %zmm4, %zmm3, %zmm3
 ; X86-NEXT:    vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
 ; X86-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
 ; X86-NEXT:    retl
@@ -5501,12 +5554,12 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vmovaps %xmm0, %xmm4
+; X86-NEXT:    vfixupimmss $5, %xmm3, %xmm1, %xmm4 {%k1}
 ; X86-NEXT:    vmovaps %xmm0, %xmm3
 ; X86-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovaps %xmm0, %xmm5
-; X86-NEXT:    vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
-; X86-NEXT:    vaddps %xmm5, %xmm3, %xmm3
+; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
 ; X86-NEXT:    vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
 ; X86-NEXT:    retl
@@ -5547,12 +5600,12 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x f
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vmovaps %xmm0, %xmm4
+; X86-NEXT:    vfixupimmss $5, {sae}, %xmm3, %xmm1, %xmm4 {%k1} {z}
 ; X86-NEXT:    vmovaps %xmm0, %xmm3
 ; X86-NEXT:    vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovaps %xmm0, %xmm5
-; X86-NEXT:    vfixupimmss $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
-; X86-NEXT:    vaddps %xmm5, %xmm3, %xmm3
+; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
 ; X86-NEXT:    vfixupimmss $6, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
 ; X86-NEXT:    retl
@@ -5592,13 +5645,13 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vmovaps %zmm0, %zmm4
+; X86-NEXT:    vfixupimmps $5, %zmm3, %zmm1, %zmm4 {%k1}
 ; X86-NEXT:    vmovaps %zmm0, %zmm3
 ; X86-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovaps %zmm0, %zmm5
-; X86-NEXT:    vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
-; X86-NEXT:    vaddps %zmm5, %zmm3, %zmm3
+; X86-NEXT:    vaddps %zmm4, %zmm3, %zmm3
 ; X86-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
 ; X86-NEXT:    vaddps %zmm0, %zmm3, %zmm0
 ; X86-NEXT:    retl
@@ -5644,13 +5697,13 @@ define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0,
 ;
 ; X86-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vmovaps %zmm0, %zmm4
+; X86-NEXT:    vfixupimmps $6, {sae}, %zmm3, %zmm1, %zmm4 {%k1} {z}
 ; X86-NEXT:    vmovaps %zmm0, %zmm3
 ; X86-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovaps %zmm0, %zmm5
-; X86-NEXT:    vfixupimmps $6, {sae}, %zmm4, %zmm1, %zmm5 {%k1} {z}
-; X86-NEXT:    vaddps %zmm5, %zmm3, %zmm3
+; X86-NEXT:    vaddps %zmm4, %zmm3, %zmm3
 ; X86-NEXT:    vfixupimmps $7, %zmm2, %zmm1, %zmm0
 ; X86-NEXT:    vaddps %zmm0, %zmm3, %zmm0
 ; X86-NEXT:    retl
@@ -5682,12 +5735,12 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vmovapd %xmm0, %xmm4
+; X86-NEXT:    vfixupimmsd $5, {sae}, %xmm3, %xmm1, %xmm4 {%k1}
 ; X86-NEXT:    vmovapd %xmm0, %xmm3
 ; X86-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovapd %xmm0, %xmm5
-; X86-NEXT:    vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1}
-; X86-NEXT:    vaddpd %xmm5, %xmm3, %xmm3
+; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
 ; X86-NEXT:    vfixupimmsd $6, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
 ; X86-NEXT:    retl
@@ -5728,12 +5781,12 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; X86-NEXT:    vmovapd %xmm0, %xmm4
+; X86-NEXT:    vfixupimmsd $5, {sae}, %xmm3, %xmm1, %xmm4 {%k1} {z}
 ; X86-NEXT:    vmovapd %xmm0, %xmm3
 ; X86-NEXT:    vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; X86-NEXT:    vmovapd %xmm0, %xmm5
-; X86-NEXT:    vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
-; X86-NEXT:    vaddpd %xmm5, %xmm3, %xmm3
+; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
 ; X86-NEXT:    vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
 ; X86-NEXT:    retl
@@ -5773,13 +5826,13 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x d
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovapd %xmm0, %xmm3
+; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovapd %xmm0, %xmm3
-; X86-NEXT:    vfmadd213sd {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
 ; X86-NEXT:    vmovapd %xmm0, %xmm4
-; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmadd213sd {{.*#+}} xmm4 {%k1} = (xmm1 * xmm4) + xmm2
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmadd213sd {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X86-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    retl
@@ -5824,13 +5877,13 @@ define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x flo
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovaps %xmm0, %xmm3
+; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovaps %xmm0, %xmm3
-; X86-NEXT:    vfmadd213ss {{.*#+}} xmm3 {%k1} = (xmm1 * xmm3) + xmm2
 ; X86-NEXT:    vmovaps %xmm0, %xmm4
-; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmadd213ss {{.*#+}} xmm4 {%k1} = (xmm1 * xmm4) + xmm2
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmadd213ss {ru-sae}, %xmm2, %xmm1, %xmm0 {%k1}
 ; X86-NEXT:    vaddps %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    retl
@@ -5875,9 +5928,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovapd %xmm0, %xmm3
-; X86-NEXT:    vfmadd213sd {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
-; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
+; X86-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %1 = extractelement <2 x double> %x0, i64 0
   %2 = extractelement <2 x double> %x1, i64 0
@@ -5917,9 +5970,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovaps %xmm0, %xmm3
-; X86-NEXT:    vfmadd213ss {{.*#+}} xmm3 {%k1} {z} = (xmm1 * xmm3) + xmm2
-; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0
+; X86-NEXT:    vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
+; X86-NEXT:    vaddps %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    retl
   %1 = extractelement <4 x float> %x0, i64 0
   %2 = extractelement <4 x float> %x1, i64 0
@@ -5953,11 +6006,11 @@ define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr
 ;
 ; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_load0:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %xmm0
 ; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps (%eax), %xmm0
 ; X86-NEXT:    vfmadd132ss {{.*#+}} xmm0 {%k1} {z} = (xmm0 * mem) + xmm1
 ; X86-NEXT:    retl
   %5 = load <4 x float>, ptr %1, align 16
@@ -5985,13 +6038,13 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3
+; X86-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vfmadd231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4
-; X86-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmadd231sd {{.*#+}} xmm4 {%k1} = (xmm0 * xmm1) + xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmadd231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6036,13 +6089,13 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x fl
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3
+; X86-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovaps %xmm2, %xmm3
-; X86-NEXT:    vfmadd231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) + xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4
-; X86-NEXT:    vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmadd231ss {{.*#+}} xmm4 {%k1} = (xmm0 * xmm1) + xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmadd231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6086,14 +6139,14 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_ss_mask_memfold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; X86-NEXT:    vmovss %xmm0, (%edx)
+; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a.val = load float, ptr %a
   %av0 = insertelement <4 x float> undef, float %a.val, i32 0
@@ -6132,13 +6185,13 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_ss_maskz_memfold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
-; X86-NEXT:    vmovss %xmm0, (%edx)
+; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a.val = load float, ptr %a
   %av0 = insertelement <4 x float> undef, float %a.val, i32 0
@@ -6178,14 +6231,14 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_sd_mask_memfold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm0
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; X86-NEXT:    vmovsd %xmm0, (%edx)
+; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a.val = load double, ptr %a
   %av0 = insertelement <2 x double> undef, double %a.val, i32 0
@@ -6220,13 +6273,13 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_sd_maskz_memfold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
-; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vmovsd %xmm0, %xmm0, %xmm0 {%k1} {z}
-; X86-NEXT:    vmovsd %xmm0, (%edx)
+; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a.val = load double, ptr %a
   %av0 = insertelement <2 x double> undef, double %a.val, i32 0
@@ -6263,13 +6316,13 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3
+; X86-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vfmsub231sd {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4
-; X86-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmsub231sd {{.*#+}} xmm4 {%k1} = (xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6320,13 +6373,13 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x fl
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3
+; X86-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovaps %xmm2, %xmm3
-; X86-NEXT:    vfmsub231ss {{.*#+}} xmm3 {%k1} = (xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4
-; X86-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfmsub231ss {{.*#+}} xmm4 {%k1} = (xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6377,13 +6430,13 @@ define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovapd %xmm2, %xmm3
+; X86-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovapd %xmm2, %xmm3
-; X86-NEXT:    vfnmsub231sd {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovapd %xmm2, %xmm4
-; X86-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfnmsub231sd {{.*#+}} xmm4 {%k1} = -(xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfnmsub231sd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddpd %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6437,13 +6490,13 @@ define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x f
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
 ; X86:       # %bb.0:
+; X86-NEXT:    vmovaps %xmm2, %xmm3
+; X86-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm3
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vmovaps %xmm2, %xmm3
-; X86-NEXT:    vfnmsub231ss {{.*#+}} xmm3 {%k1} = -(xmm0 * xmm1) - xmm3
 ; X86-NEXT:    vmovaps %xmm2, %xmm4
-; X86-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4
-; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3
+; X86-NEXT:    vfnmsub231ss {{.*#+}} xmm4 {%k1} = -(xmm0 * xmm1) - xmm4
+; X86-NEXT:    vaddps %xmm3, %xmm4, %xmm3
 ; X86-NEXT:    vfnmsub231ss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; X86-NEXT:    vaddps %xmm3, %xmm2, %xmm0
 ; X86-NEXT:    retl
@@ -6492,9 +6545,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
 ;
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vfmadd231ss {{.*#+}} xmm1 {%k1} = (xmm0 * mem) + xmm1
 ; X86-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -6520,9 +6573,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vfmadd132ss {{.*#+}} xmm0 {%k1} = (xmm0 * mem) + xmm1
 ; X86-NEXT:    retl
   %q = load float, ptr %ptr_b
@@ -7154,11 +7207,11 @@ define <16 x i32> @test_x86_avx512_psllv_d_512_const() {
 ;
 ; X86-LABEL: test_x86_avx512_psllv_d_512_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
+; X86-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
 ; X86-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
-; X86-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
+; X86-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
 ; X86-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
-; X86-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    retl
   %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
   %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 -1>)
@@ -7227,11 +7280,11 @@ define <8 x i64> @test_x86_avx512_psllv_q_512_const() {
 ;
 ; X86-LABEL: test_x86_avx512_psllv_q_512_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
+; X86-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [4,4,4,4,4,4,4,18446744073709551615]
 ; X86-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
-; X86-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
+; X86-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
 ; X86-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
-; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    retl
   %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1,i64 2, i64 0, i64 34, i64 -2>)
   %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1,  i64 1, i64 1, i64 1, i64 -1>)
@@ -7402,11 +7455,11 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512_const() {
 ;
 ; X86-LABEL: test_x86_avx512_psrlv_d_512_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
+; X86-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
 ; X86-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
-; X86-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295]
+; X86-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0]
 ; X86-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
-; X86-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    retl
   %res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 2, i32 9, i32 0, i32 -1, i32 3, i32 7, i32 -1, i32 0, i32 4, i32 5, i32 -2, i32 0, i32 5, i32 3, i32 -3, i32 0>, <16 x i32> <i32 1, i32 0, i32 33, i32 -1,i32 2, i32 0, i32 34, i32 -2, i32 3, i32 0, i32 35, i32 -1, i32 4, i32 0, i32 36, i32 -3>)
   %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 -1>, <16 x i32> <i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,  i32 1, i32 1, i32 1, i32 1, i32 -1  >)
@@ -7475,11 +7528,11 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512_const() {
 ;
 ; X86-LABEL: test_x86_avx512_psrlv_q_512_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
+; X86-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [4,4,4,4,4,4,4,18446744073709551615]
 ; X86-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
-; X86-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615]
+; X86-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0]
 ; X86-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm1
-; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; X86-NEXT:    retl
   %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 2, i64 9, i64 0, i64 -1, i64 3, i64 7, i64 -1, i64 0>, <8 x i64> <i64 1, i64 0, i64 33, i64 -1,i64 2, i64 0, i64 34, i64 -2>)
   %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1,  i64 1, i64 1, i64 1, i64 -1>)
@@ -7614,12 +7667,11 @@ define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    vmovaps 72(%ebp), %zmm3
 ; X86-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k0
 ; X86-NEXT:    vcmplt_oqpd 8(%ebp), %zmm2, %k1
 ; X86-NEXT:    kunpckbw %k0, %k1, %k1
-; X86-NEXT:    vmovaps 136(%ebp), %zmm3 {%k1}
-; X86-NEXT:    vmovaps %zmm3, %zmm0
+; X86-NEXT:    vmovaps 72(%ebp), %zmm0
+; X86-NEXT:    vmovaps 136(%ebp), %zmm0 {%k1}
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
@@ -7656,10 +7708,9 @@ define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    vmovaps 72(%ebp), %zmm2
 ; X86-NEXT:    vcmplt_oqpd %zmm1, %zmm0, %k1
-; X86-NEXT:    vmovaps 136(%ebp), %zmm2 {%k1}
-; X86-NEXT:    vmovaps %zmm2, %zmm0
+; X86-NEXT:    vmovaps 72(%ebp), %zmm0
+; X86-NEXT:    vmovaps 136(%ebp), %zmm0 {%k1}
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/avx512-load-store.ll b/llvm/test/CodeGen/X86/avx512-load-store.ll
index ce6bfa90d88a7..144b04cca32cd 100644
--- a/llvm/test/CodeGen/X86/avx512-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx512-load-store.ll
@@ -103,9 +103,9 @@ define void @test_mm_mask_store_ss(ptr %__W, i8 zeroext %__U, <4 x float> %__A)
 ;
 ; CHECK32-LABEL: test_mm_mask_store_ss:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss %xmm0, (%eax) {%k1}
 ; CHECK32-NEXT:    retl
 entry:
@@ -126,9 +126,9 @@ define void @test_mm_mask_store_sd(ptr %__W, i8 zeroext %__U, <2 x double> %__A)
 ;
 ; CHECK32-LABEL: test_mm_mask_store_sd:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd %xmm0, (%eax) {%k1}
 ; CHECK32-NEXT:    retl
 entry:
@@ -148,9 +148,9 @@ define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, ptr
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -173,9 +173,9 @@ define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, pt
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -197,9 +197,9 @@ define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, ptr %__W) local_unnam
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -220,9 +220,9 @@ define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, ptr %__W) local_unna
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -244,9 +244,9 @@ define void @test_mm_mask_store_ss_2(ptr %__P, i8 zeroext %__U, <4 x float> %__A
 ;
 ; CHECK32-LABEL: test_mm_mask_store_ss_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss %xmm0, (%eax) {%k1}
 ; CHECK32-NEXT:    retl
 entry:
@@ -266,9 +266,9 @@ define void @test_mm_mask_store_sd_2(ptr %__P, i8 zeroext %__U, <2 x double> %__
 ;
 ; CHECK32-LABEL: test_mm_mask_store_sd_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd %xmm0, (%eax) {%k1}
 ; CHECK32-NEXT:    retl
 entry:
@@ -288,9 +288,9 @@ define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, pt
 ;
 ; CHECK32-LABEL: test_mm_mask_load_ss_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -311,9 +311,9 @@ define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, ptr readonly %__W)
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_ss_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -333,9 +333,9 @@ define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U,
 ;
 ; CHECK32-LABEL: test_mm_mask_load_sd_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
@@ -356,9 +356,9 @@ define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, ptr readonly %__W)
 ;
 ; CHECK32-LABEL: test_mm_maskz_load_sd_2:
 ; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    kmovw %ecx, %k1
 ; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero
 ; CHECK32-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index da0cef0e4e99b..5212916810ef8 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -208,10 +208,10 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; X86-LABEL: mand16_mem:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovw (%ecx), %k0
+; X86-NEXT:    kmovw (%eax), %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw (%eax), %k1
-; X86-NEXT:    korw %k1, %k0, %k0
+; X86-NEXT:    korw %k0, %k1, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
@@ -1122,10 +1122,10 @@ define <16 x i1> @test15(i32 %x, i32 %y)  {
 ;
 ; X86-LABEL: test15:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $21845, %eax ## imm = 0x5555
 ; X86-NEXT:    movl $1, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovgl %eax, %ecx
 ; X86-NEXT:    kmovd %ecx, %k0
 ; X86-NEXT:    vpmovm2b %k0, %xmm0
@@ -1353,14 +1353,14 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; X86-LABEL: test17:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setg %al
 ; X86-NEXT:    kshiftrq $6, %k0, %k1
 ; X86-NEXT:    kshiftlq $6, %k1, %k1
 ; X86-NEXT:    kshiftlq $59, %k0, %k0
 ; X86-NEXT:    kshiftrq $59, %k0, %k0
 ; X86-NEXT:    korq %k1, %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setg %al
 ; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    kshiftlq $63, %k1, %k1
 ; X86-NEXT:    kshiftrq $58, %k1, %k1
@@ -1457,18 +1457,18 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ;
 ; X86-LABEL: test18:
 ; X86:       ## %bb.0:
-; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    kshiftrw $8, %k1, %k2
-; X86-NEXT:    kshiftrw $9, %k1, %k1
 ; X86-NEXT:    movb $-65, %al
-; X86-NEXT:    kmovd %eax, %k3
-; X86-NEXT:    kandb %k3, %k0, %k0
-; X86-NEXT:    kshiftlb $6, %k1, %k1
-; X86-NEXT:    korb %k1, %k0, %k0
+; X86-NEXT:    kmovd %eax, %k0
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    kandb %k0, %k1, %k0
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    kshiftrw $9, %k1, %k2
+; X86-NEXT:    kshiftlb $6, %k2, %k2
+; X86-NEXT:    korb %k2, %k0, %k0
 ; X86-NEXT:    kshiftlb $1, %k0, %k0
 ; X86-NEXT:    kshiftrb $1, %k0, %k0
-; X86-NEXT:    kshiftlb $7, %k2, %k1
+; X86-NEXT:    kshiftrw $8, %k1, %k1
+; X86-NEXT:    kshiftlb $7, %k1, %k1
 ; X86-NEXT:    korb %k1, %k0, %k0
 ; X86-NEXT:    vpmovm2w %k0, %xmm0
 ; X86-NEXT:    retl
@@ -1675,10 +1675,10 @@ define void @store_v1i1(<1 x i1> %c , ptr %ptr) {
 ; X86-LABEL: store_v1i1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
 ; X86-NEXT:    kshiftlb $7, %k0, %k0
 ; X86-NEXT:    kshiftrb $7, %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <1 x i1> %c, <i1 1>
@@ -1734,10 +1734,10 @@ define void @store_v2i1(<2 x i1> %c , ptr %ptr) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; X86-NEXT:    vpmovq2m %xmm0, %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
 ; X86-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-NEXT:    kshiftrb $6, %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <2 x i1> %c, <i1 1, i1 1>
@@ -1793,10 +1793,10 @@ define void @store_v4i1(<4 x i1> %c , ptr %ptr) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-NEXT:    vpmovd2m %xmm0, %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
 ; X86-NEXT:    kshiftlb $4, %k0, %k0
 ; X86-NEXT:    kshiftrb $4, %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
@@ -1847,8 +1847,8 @@ define void @store_v8i1(<8 x i1> %c , ptr %ptr) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; X86-NEXT:    vpmovw2m %xmm0, %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotb %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
@@ -1897,8 +1897,8 @@ define void @store_v16i1(<16 x i1> %c , ptr %ptr) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; X86-NEXT:    vpmovb2m %xmm0, %k0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    knotw %k0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %k0, (%eax)
 ; X86-NEXT:    retl
   %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
@@ -1958,10 +1958,10 @@ define void @store_i16_i1(i16 %x, ptr%y) {
 ;
 ; X86-LABEL: store_i16_i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb %al, (%ecx)
 ; X86-NEXT:    retl
   %c = trunc i16 %x to i1
   store i1 %c, ptr %y
@@ -1977,10 +1977,10 @@ define void @store_i8_i1(i8 %x, ptr%y) {
 ;
 ; X86-LABEL: store_i8_i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb %al, (%ecx)
 ; X86-NEXT:    retl
   %c = trunc i8 %x to i1
   store i1 %c, ptr %y
@@ -2320,9 +2320,9 @@ define void @ktest_2(<32 x float> %in, ptr %base) {
 ; X86-NEXT:    vcmpgtps (%eax), %zmm0, %k1
 ; X86-NEXT:    vcmpgtps 64(%eax), %zmm1, %k2
 ; X86-NEXT:    kunpckwd %k1, %k2, %k0
+; X86-NEXT:    vmovups 4(%eax), %zmm2 {%k1} {z}
+; X86-NEXT:    vcmpltps %zmm2, %zmm0, %k1
 ; X86-NEXT:    vmovups 68(%eax), %zmm2 {%k2} {z}
-; X86-NEXT:    vmovups 4(%eax), %zmm3 {%k1} {z}
-; X86-NEXT:    vcmpltps %zmm3, %zmm0, %k1
 ; X86-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; X86-NEXT:    kunpckwd %k1, %k2, %k1
 ; X86-NEXT:    kortestd %k1, %k0
@@ -2720,9 +2720,9 @@ define void @store_8i1_1(ptr %a, <8 x i16> %v) {
 ;
 ; X86-LABEL: store_8i1_1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; X86-NEXT:    vpmovw2m %xmm0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovb %k0, (%eax)
 ; X86-NEXT:    retl
   %v1 = trunc <8 x i16> %v to <8 x i1>
@@ -2879,9 +2879,9 @@ define void @store_32i1_1(ptr %a, <32 x i16> %v) {
 ;
 ; X86-LABEL: store_32i1_1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsllw $15, %zmm0, %zmm0
 ; X86-NEXT:    vpmovw2m %zmm0, %k0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovd %k0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4239,8 +4239,8 @@ define void @store_v128i1_constant(ptr %R) {
 ;
 ; X86-LABEL: store_v128i1_constant:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps {{.*#+}} xmm0 = [4294963197,3758096251,4294959101,3221225403]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 entry:
@@ -4310,9 +4310,9 @@ define void @mask_not_cast(ptr, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>) {
 ;
 ; X86-LABEL: mask_not_cast:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpcmpnleud %zmm3, %zmm2, %k1
 ; X86-NEXT:    vptestmd %zmm0, %zmm1, %k1 {%k1}
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu32 %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4436,13 +4436,13 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ;
 ; X86-LABEL: ktest_3:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vptestnmd %ymm0, %ymm0, %k0
-; X86-NEXT:    vptestnmd %ymm1, %ymm1, %k1
-; X86-NEXT:    korb %k1, %k0, %k0
+; X86-NEXT:    vptestnmd %ymm3, %ymm3, %k0
 ; X86-NEXT:    vptestnmd %ymm2, %ymm2, %k1
-; X86-NEXT:    vptestnmd %ymm3, %ymm3, %k2
-; X86-NEXT:    korb %k2, %k1, %k1
-; X86-NEXT:    ktestb %k1, %k0
+; X86-NEXT:    korb %k0, %k1, %k0
+; X86-NEXT:    vptestnmd %ymm1, %ymm1, %k1
+; X86-NEXT:    vptestnmd %ymm0, %ymm0, %k2
+; X86-NEXT:    korb %k1, %k2, %k1
+; X86-NEXT:    ktestb %k0, %k1
 ; X86-NEXT:    je LBB74_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    vzeroupper
@@ -4564,13 +4564,13 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ;
 ; X86-LABEL: ktest_4:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vptestnmq %zmm0, %zmm0, %k0
-; X86-NEXT:    vptestnmq %zmm1, %zmm1, %k1
-; X86-NEXT:    korb %k1, %k0, %k0
+; X86-NEXT:    vptestnmq %zmm3, %zmm3, %k0
 ; X86-NEXT:    vptestnmq %zmm2, %zmm2, %k1
-; X86-NEXT:    vptestnmq %zmm3, %zmm3, %k2
-; X86-NEXT:    korb %k2, %k1, %k1
-; X86-NEXT:    ktestb %k1, %k0
+; X86-NEXT:    korb %k0, %k1, %k0
+; X86-NEXT:    vptestnmq %zmm1, %zmm1, %k1
+; X86-NEXT:    vptestnmq %zmm0, %zmm0, %k2
+; X86-NEXT:    korb %k1, %k2, %k1
+; X86-NEXT:    ktestb %k0, %k1
 ; X86-NEXT:    je LBB75_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    vzeroupper
@@ -4690,13 +4690,13 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ;
 ; X86-LABEL: ktest_5:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
-; X86-NEXT:    vptestnmd %zmm1, %zmm1, %k1
-; X86-NEXT:    korw %k1, %k0, %k0
+; X86-NEXT:    vptestnmd %zmm3, %zmm3, %k0
 ; X86-NEXT:    vptestnmd %zmm2, %zmm2, %k1
-; X86-NEXT:    vptestnmd %zmm3, %zmm3, %k2
-; X86-NEXT:    korw %k2, %k1, %k1
-; X86-NEXT:    ktestw %k1, %k0
+; X86-NEXT:    korw %k0, %k1, %k0
+; X86-NEXT:    vptestnmd %zmm1, %zmm1, %k1
+; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k2
+; X86-NEXT:    korw %k1, %k2, %k1
+; X86-NEXT:    ktestw %k0, %k1
 ; X86-NEXT:    je LBB76_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    vzeroupper
@@ -4850,13 +4850,13 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ;
 ; X86-LABEL: ktest_6:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vptestnmw %zmm0, %zmm0, %k0
-; X86-NEXT:    vptestnmw %zmm1, %zmm1, %k1
-; X86-NEXT:    kord %k1, %k0, %k0
+; X86-NEXT:    vptestnmw %zmm3, %zmm3, %k0
 ; X86-NEXT:    vptestnmw %zmm2, %zmm2, %k1
-; X86-NEXT:    vptestnmw %zmm3, %zmm3, %k2
-; X86-NEXT:    kord %k2, %k1, %k1
-; X86-NEXT:    ktestd %k1, %k0
+; X86-NEXT:    kord %k0, %k1, %k0
+; X86-NEXT:    vptestnmw %zmm1, %zmm1, %k1
+; X86-NEXT:    vptestnmw %zmm0, %zmm0, %k2
+; X86-NEXT:    kord %k1, %k2, %k1
+; X86-NEXT:    ktestd %k0, %k1
 ; X86-NEXT:    je LBB77_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    vzeroupper
@@ -5006,13 +5006,13 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ;
 ; X86-LABEL: ktest_7:
 ; X86:       ## %bb.0:
-; X86-NEXT:    vptestnmb %zmm0, %zmm0, %k0
-; X86-NEXT:    vptestnmb %zmm1, %zmm1, %k1
-; X86-NEXT:    korq %k1, %k0, %k0
+; X86-NEXT:    vptestnmb %zmm3, %zmm3, %k0
 ; X86-NEXT:    vptestnmb %zmm2, %zmm2, %k1
-; X86-NEXT:    vptestnmb %zmm3, %zmm3, %k2
-; X86-NEXT:    korq %k2, %k1, %k1
-; X86-NEXT:    kandq %k1, %k0, %k0
+; X86-NEXT:    korq %k0, %k1, %k0
+; X86-NEXT:    vptestnmb %zmm1, %zmm1, %k1
+; X86-NEXT:    vptestnmb %zmm0, %zmm0, %k2
+; X86-NEXT:    korq %k1, %k2, %k1
+; X86-NEXT:    kandq %k0, %k1, %k0
 ; X86-NEXT:    kshiftrq $32, %k0, %k1
 ; X86-NEXT:    kortestd %k1, %k0
 ; X86-NEXT:    je LBB78_1
@@ -5104,17 +5104,17 @@ define <64 x i1> @mask64_insert(i32 %a) {
 ;
 ; X86-LABEL: mask64_insert:
 ; X86:       ## %bb.0:
-; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    movl $-131076, %eax ## imm = 0xFFFDFFFC
-; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    kmovd %eax, %k0
 ; X86-NEXT:    movl $-131075, %eax ## imm = 0xFFFDFFFD
-; X86-NEXT:    kmovd %eax, %k2
-; X86-NEXT:    kunpckdq %k1, %k2, %k1
-; X86-NEXT:    kshiftrq $1, %k1, %k1
-; X86-NEXT:    kshiftlq $1, %k1, %k1
-; X86-NEXT:    kshiftlq $63, %k0, %k0
-; X86-NEXT:    kshiftrq $63, %k0, %k0
-; X86-NEXT:    korq %k0, %k1, %k0
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    kunpckdq %k0, %k1, %k0
+; X86-NEXT:    kshiftrq $1, %k0, %k0
+; X86-NEXT:    kshiftlq $1, %k0, %k0
+; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    kshiftlq $63, %k1, %k1
+; X86-NEXT:    kshiftrq $63, %k1, %k1
+; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    vpmovm2b %k0, %zmm0
 ; X86-NEXT:    retl
   %a_i = trunc i32 %a to i1
@@ -5242,7 +5242,7 @@ define <1 x i1> @usub_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    kandnw %k1, %k0, %k0
+; X86-NEXT:    kandnw %k0, %k1, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
@@ -5311,7 +5311,7 @@ define <1 x i1> @ssub_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    kandnw %k1, %k0, %k0
+; X86-NEXT:    kandnw %k0, %k1, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index 162f5efd78f6d..f80df0a2a963c 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -291,11 +291,11 @@ define dso_local x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1,
 ; X32-NEXT:    vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT:    kmovd %eax, %k0
+; X32-NEXT:    vpmovm2b %k0, %zmm0
+; X32-NEXT:    kmovd %ecx, %k0
+; X32-NEXT:    vpmovm2b %k0, %zmm1
 ; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    kmovd %eax, %k2
-; X32-NEXT:    vpmovm2b %k2, %zmm0
-; X32-NEXT:    vpmovm2b %k1, %zmm1
 ; X32-NEXT:    vpmovm2b %k0, %zmm2
 ; X32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X32-NEXT:    # kill: def $ymm1 killed $ymm1 killed $zmm1
@@ -522,11 +522,11 @@ define dso_local x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1,
 ; X32-NEXT:    vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT:    kmovd %eax, %k0
+; X32-NEXT:    vpmovm2b %k0, %zmm0
+; X32-NEXT:    kmovd %ecx, %k0
+; X32-NEXT:    vpmovm2b %k0, %zmm1
 ; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    kmovd %eax, %k2
-; X32-NEXT:    vpmovm2b %k2, %zmm0
-; X32-NEXT:    vpmovm2b %k1, %zmm1
 ; X32-NEXT:    vpmovm2b %k0, %zmm2
 ; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; X32-NEXT:    # kill: def $xmm1 killed $xmm1 killed $zmm1
@@ -752,11 +752,11 @@ define dso_local x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x
 ; X32-NEXT:    vmovups %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X32-NEXT:    vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT:    kmovd %eax, %k0
+; X32-NEXT:    vpmovm2w %k0, %zmm0
+; X32-NEXT:    kmovd %ecx, %k0
+; X32-NEXT:    vpmovm2w %k0, %zmm1
 ; X32-NEXT:    kmovd %edx, %k0
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    kmovd %eax, %k2
-; X32-NEXT:    vpmovm2w %k2, %zmm0
-; X32-NEXT:    vpmovm2w %k1, %zmm1
 ; X32-NEXT:    vpmovm2w %k0, %zmm2
 ; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; X32-NEXT:    # kill: def $xmm1 killed $xmm1 killed $zmm1
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index a664cc7f17a5c..1880a7e2e898d 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -692,17 +692,17 @@ define dso_local x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i1> %x,
 ; X32-LABEL: test_CallargRet128Vector:
 ; X32:       # %bb.0:
 ; X32-NEXT:    subl $44, %esp
-; X32-NEXT:    vmovups %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32-NEXT:    vmovups %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32-NEXT:    vmovups %xmm4, (%esp) # 16-byte Spill
 ; X32-NEXT:    vmovdqa %xmm1, %xmm4
-; X32-NEXT:    vpslld $31, %xmm0, %xmm1
-; X32-NEXT:    vpmovd2m %xmm1, %k1
-; X32-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X32-NEXT:    vmovdqa %xmm4, %xmm1
-; X32-NEXT:    vmovdqa %xmm4, %xmm2
+; X32-NEXT:    vmovdqa %xmm0, %xmm5
+; X32-NEXT:    vmovdqa %xmm1, %xmm2
 ; X32-NEXT:    calll _test_argRet128Vector
-; X32-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
+; X32-NEXT:    vpslld $31, %xmm5, %xmm1
+; X32-NEXT:    vpmovd2m %xmm1, %k1
 ; X32-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}
-; X32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload
+; X32-NEXT:    vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload
 ; X32-NEXT:    addl $44, %esp
 ; X32-NEXT:    retl
 ;
@@ -913,20 +913,18 @@ define dso_local x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i1> %x
 define dso_local x86_regcallcc <32 x float> @testf32_inp(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind {
 ; X32-LABEL: testf32_inp:
 ; X32:       # %bb.0:
-; X32-NEXT:    subl $44, %esp
-; X32-NEXT:    vmovups %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32-NEXT:    subl $28, %esp
 ; X32-NEXT:    vmovups %xmm6, (%esp) # 16-byte Spill
-; X32-NEXT:    vaddps %zmm2, %zmm0, %zmm6
-; X32-NEXT:    vaddps %zmm3, %zmm1, %zmm7
-; X32-NEXT:    vmulps %zmm2, %zmm0, %zmm0
-; X32-NEXT:    vsubps %zmm0, %zmm6, %zmm0
-; X32-NEXT:    vmulps %zmm3, %zmm1, %zmm1
-; X32-NEXT:    vsubps %zmm1, %zmm7, %zmm1
+; X32-NEXT:    vmulps %zmm2, %zmm0, %zmm6
+; X32-NEXT:    vaddps %zmm2, %zmm0, %zmm0
+; X32-NEXT:    vsubps %zmm6, %zmm0, %zmm0
 ; X32-NEXT:    vaddps %zmm4, %zmm0, %zmm0
+; X32-NEXT:    vmulps %zmm3, %zmm1, %zmm2
+; X32-NEXT:    vaddps %zmm3, %zmm1, %zmm1
+; X32-NEXT:    vsubps %zmm2, %zmm1, %zmm1
 ; X32-NEXT:    vaddps %zmm5, %zmm1, %zmm1
 ; X32-NEXT:    vmovups (%esp), %xmm6 # 16-byte Reload
-; X32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
-; X32-NEXT:    addl $44, %esp
+; X32-NEXT:    addl $28, %esp
 ; X32-NEXT:    retl
 ;
 ; WIN64-LABEL: testf32_inp:
@@ -965,46 +963,46 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %ebp
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    subl $20, %esp
 ; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    leal (%edx,%edi), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %eax
-; X32-NEXT:    subl %edi, %eax
-; X32-NEXT:    movl %ebp, %edx
-; X32-NEXT:    subl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    imull %edx, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %edx
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %eax, %edx
-; X32-NEXT:    addl %ebx, %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NEXT:    subl %ebx, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    imull %edi, %eax
-; X32-NEXT:    addl %edx, %eax
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    leal (%edx,%edi), %ebp
+; X32-NEXT:    imull %esi, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %edx, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    addl %eax, %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    imull %edx, %ecx
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    addl %ebp, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    imull %ecx, %ebx
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    imull %esi, %eax
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    imull %ecx, %ebp
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %edx, %eax
+; X32-NEXT:    addl $20, %esp
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
@@ -1131,21 +1129,21 @@ define dso_local x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-64, %esp
 ; X32-NEXT:    subl $64, %esp
-; X32-NEXT:    vaddps %zmm3, %zmm1, %zmm1
 ; X32-NEXT:    vaddps %zmm2, %zmm0, %zmm0
 ; X32-NEXT:    vaddps %zmm0, %zmm4, %zmm0
-; X32-NEXT:    vaddps %zmm1, %zmm5, %zmm1
-; X32-NEXT:    vaddps %zmm1, %zmm7, %zmm1
 ; X32-NEXT:    vaddps %zmm0, %zmm6, %zmm0
 ; X32-NEXT:    vaddps 8(%ebp), %zmm0, %zmm0
-; X32-NEXT:    vaddps 72(%ebp), %zmm1, %zmm1
-; X32-NEXT:    vaddps 200(%ebp), %zmm1, %zmm1
 ; X32-NEXT:    vaddps 136(%ebp), %zmm0, %zmm0
 ; X32-NEXT:    vaddps 264(%ebp), %zmm0, %zmm0
-; X32-NEXT:    vaddps 328(%ebp), %zmm1, %zmm1
-; X32-NEXT:    vaddps 456(%ebp), %zmm1, %zmm1
 ; X32-NEXT:    vaddps 392(%ebp), %zmm0, %zmm0
 ; X32-NEXT:    vaddps 520(%ebp), %zmm0, %zmm0
+; X32-NEXT:    vaddps %zmm3, %zmm1, %zmm1
+; X32-NEXT:    vaddps %zmm1, %zmm5, %zmm1
+; X32-NEXT:    vaddps %zmm1, %zmm7, %zmm1
+; X32-NEXT:    vaddps 72(%ebp), %zmm1, %zmm1
+; X32-NEXT:    vaddps 200(%ebp), %zmm1, %zmm1
+; X32-NEXT:    vaddps 328(%ebp), %zmm1, %zmm1
+; X32-NEXT:    vaddps 456(%ebp), %zmm1, %zmm1
 ; X32-NEXT:    vaddps 584(%ebp), %zmm1, %zmm1
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -1217,12 +1215,10 @@ define dso_local x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32
 define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signext, i32, i64, i16 signext, ptr) #0 {
 ; X32-LABEL: test_argRetMixTypes:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    vcvtsi2sd %eax, %xmm3, %xmm2
 ; X32-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; X32-NEXT:    vcvtsi2sd %eax, %xmm3, %xmm1
-; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X32-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vcvtsi2sd %ecx, %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %edx, %xmm1
@@ -1231,10 +1227,10 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vcvtsi2sd %esi, %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sdl (%ebx), %xmm3, %xmm1
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vcvtsi2sdl (%eax), %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vcvttsd2si %xmm0, %eax
-; X32-NEXT:    popl %ebx
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 721ffbe1ceb79..cbff3e8329615 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -7,8 +7,8 @@
 define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 ; X86-LABEL: select00:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1
@@ -35,8 +35,8 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
 ; X86-LABEL: select01:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT:    cmpl $255, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1
@@ -63,10 +63,10 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
 define float @select02(float %a, float %b, float %c, float %eps) {
 ; X86-LABEL: select02:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    cmovael %eax, %ecx
 ; X86-NEXT:    flds (%ecx)
 ; X86-NEXT:    retl
@@ -85,10 +85,10 @@ define float @select02(float %a, float %b, float %c, float %eps) {
 define double @select03(double %a, double %b, double %c, double %eps) {
 ; X86-LABEL: select03:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    cmovael %eax, %ecx
 ; X86-NEXT:    fldl (%ecx)
 ; X86-NEXT:    retl
@@ -152,9 +152,9 @@ define i8 @select05_mem(ptr %a.0, ptr %m) {
 ; X86-AVX512F-LABEL: select05_mem:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512F-NEXT:    kmovw %ecx, %k0
+; X86-AVX512F-NEXT:    movzbl (%eax), %eax
+; X86-AVX512F-NEXT:    kmovw %eax, %k0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    movzbl (%eax), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
 ; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
@@ -176,9 +176,9 @@ define i8 @select05_mem(ptr %a.0, ptr %m) {
 ; X86-AVX512BW-LABEL: select05_mem:
 ; X86-AVX512BW:       # %bb.0:
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512BW-NEXT:    kmovd %ecx, %k0
+; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
+; X86-AVX512BW-NEXT:    kmovd %eax, %k0
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
 ; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
@@ -227,9 +227,9 @@ define i8 @select06_mem(ptr %a.0, ptr %m) {
 ; X86-AVX512F-LABEL: select06_mem:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512F-NEXT:    kmovw %ecx, %k0
+; X86-AVX512F-NEXT:    movzbl (%eax), %eax
+; X86-AVX512F-NEXT:    kmovw %eax, %k0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    movzbl (%eax), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
 ; X86-AVX512F-NEXT:    kandw %k1, %k0, %k0
@@ -251,9 +251,9 @@ define i8 @select06_mem(ptr %a.0, ptr %m) {
 ; X86-AVX512BW-LABEL: select06_mem:
 ; X86-AVX512BW:       # %bb.0:
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX512BW-NEXT:    kmovd %ecx, %k0
+; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
+; X86-AVX512BW-NEXT:    kmovd %eax, %k0
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
 ; X86-AVX512BW-NEXT:    kandw %k1, %k0, %k0
@@ -284,11 +284,11 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 ; X86-AVX512F-NEXT:    kmovw %eax, %k0
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    kandnw %k0, %k1, %k0
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k2
-; X86-AVX512F-NEXT:    kandnw %k2, %k0, %k2
-; X86-AVX512F-NEXT:    kandw %k0, %k1, %k0
-; X86-AVX512F-NEXT:    korw %k2, %k0, %k0
+; X86-AVX512F-NEXT:    kandw %k1, %k2, %k1
+; X86-AVX512F-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512F-NEXT:    kmovw %k0, %eax
 ; X86-AVX512F-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-AVX512F-NEXT:    retl
@@ -311,11 +311,11 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k0
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k2
-; X86-AVX512BW-NEXT:    kandnw %k2, %k0, %k2
-; X86-AVX512BW-NEXT:    kandw %k0, %k1, %k0
-; X86-AVX512BW-NEXT:    korw %k2, %k0, %k0
+; X86-AVX512BW-NEXT:    kandw %k1, %k2, %k1
+; X86-AVX512BW-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512BW-NEXT:    kmovd %k0, %eax
 ; X86-AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-AVX512BW-NEXT:    retl
@@ -357,9 +357,9 @@ define i64 @pr30249() {
 define double @pr30561_f64(double %b, double %a, i1 %c) {
 ; X86-LABEL: pr30561_f64:
 ; X86:       # %bb.0:
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %eax, %ecx
 ; X86-NEXT:    fldl (%ecx)
 ; X86-NEXT:    retl
@@ -382,9 +382,9 @@ define double @pr30561_f64(double %b, double %a, i1 %c) {
 define float @pr30561_f32(float %b, float %a, i1 %c) {
 ; X86-LABEL: pr30561_f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %eax, %ecx
 ; X86-NEXT:    flds (%ecx)
 ; X86-NEXT:    retl
@@ -555,24 +555,22 @@ define <16 x i64> @narrowExtractedVectorSelect_crash(<16 x i64> %arg, <16 x i16>
 define void @vselect_v1i1(ptr %w, ptr %x, ptr %y) nounwind {
 ; X86-AVX512F-LABEL: vselect_v1i1:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    pushl %esi
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    movzbl (%eax), %esi
-; X86-AVX512F-NEXT:    kmovw %esi, %k0
-; X86-AVX512F-NEXT:    movzbl (%edx), %edx
-; X86-AVX512F-NEXT:    kmovw %edx, %k1
-; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX512F-NEXT:    movzbl (%eax), %eax
+; X86-AVX512F-NEXT:    kmovw %eax, %k0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    movzbl (%eax), %eax
+; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    kandnw %k0, %k1, %k0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    movzbl (%eax), %ecx
 ; X86-AVX512F-NEXT:    kmovw %ecx, %k2
-; X86-AVX512F-NEXT:    kandnw %k1, %k2, %k1
-; X86-AVX512F-NEXT:    kandw %k2, %k0, %k0
-; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512F-NEXT:    kandw %k1, %k2, %k1
+; X86-AVX512F-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512F-NEXT:    kshiftlw $15, %k0, %k0
 ; X86-AVX512F-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512F-NEXT:    kmovw %k0, %ecx
 ; X86-AVX512F-NEXT:    movb %cl, (%eax)
-; X86-AVX512F-NEXT:    popl %esi
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X64-AVX512F-LABEL: vselect_v1i1:
@@ -594,24 +592,22 @@ define void @vselect_v1i1(ptr %w, ptr %x, ptr %y) nounwind {
 ;
 ; X86-AVX512BW-LABEL: vselect_v1i1:
 ; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    pushl %esi
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512BW-NEXT:    movzbl (%eax), %esi
-; X86-AVX512BW-NEXT:    kmovd %esi, %k0
-; X86-AVX512BW-NEXT:    movzbl (%edx), %edx
-; X86-AVX512BW-NEXT:    kmovd %edx, %k1
-; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
+; X86-AVX512BW-NEXT:    kmovd %eax, %k0
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512BW-NEXT:    movzbl (%eax), %eax
+; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512BW-NEXT:    movzbl (%eax), %ecx
 ; X86-AVX512BW-NEXT:    kmovd %ecx, %k2
-; X86-AVX512BW-NEXT:    kandnw %k1, %k2, %k1
-; X86-AVX512BW-NEXT:    kandw %k2, %k0, %k0
-; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512BW-NEXT:    kandw %k1, %k2, %k1
+; X86-AVX512BW-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
 ; X86-AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
 ; X86-AVX512BW-NEXT:    kmovd %k0, %ecx
 ; X86-AVX512BW-NEXT:    movb %cl, (%eax)
-; X86-AVX512BW-NEXT:    popl %esi
 ; X86-AVX512BW-NEXT:    retl
 ;
 ; X64-AVX512BW-LABEL: vselect_v1i1:
@@ -647,12 +643,12 @@ define void @select_v1i1(ptr %w, ptr %x, ptr %y, i1 %z) nounwind {
 ; X86-AVX512F-NEXT:    jne .LBB18_1
 ; X86-AVX512F-NEXT:  # %bb.2:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512F-NEXT:    movzbl (%edx), %edx
-; X86-AVX512F-NEXT:    kmovw %edx, %k0
+; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX512F-NEXT:    kmovw %ecx, %k0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512F-NEXT:    movzbl (%ecx), %ecx
 ; X86-AVX512F-NEXT:    kmovw %ecx, %k1
-; X86-AVX512F-NEXT:    kxorw %k1, %k0, %k0
+; X86-AVX512F-NEXT:    kxorw %k0, %k1, %k0
 ; X86-AVX512F-NEXT:    jmp .LBB18_3
 ; X86-AVX512F-NEXT:  .LBB18_1:
 ; X86-AVX512F-NEXT:    movzbl (%eax), %ecx
@@ -692,12 +688,12 @@ define void @select_v1i1(ptr %w, ptr %x, ptr %y, i1 %z) nounwind {
 ; X86-AVX512BW-NEXT:    jne .LBB18_1
 ; X86-AVX512BW-NEXT:  # %bb.2:
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512BW-NEXT:    movzbl (%edx), %edx
-; X86-AVX512BW-NEXT:    kmovd %edx, %k0
+; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
+; X86-AVX512BW-NEXT:    kmovd %ecx, %k0
+; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512BW-NEXT:    movzbl (%ecx), %ecx
 ; X86-AVX512BW-NEXT:    kmovd %ecx, %k1
-; X86-AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; X86-AVX512BW-NEXT:    kxorw %k0, %k1, %k0
 ; X86-AVX512BW-NEXT:    jmp .LBB18_3
 ; X86-AVX512BW-NEXT:  .LBB18_1:
 ; X86-AVX512BW-NEXT:    movzbl (%eax), %ecx
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
index 78957d10301c9..1bc70a38ff94a 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -5,12 +5,19 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
 
 define <16 x i32> @shuffle_v8i64(<16 x i32> %t0, <16 x i32> %t1) {
-; CHECK-LABEL: shuffle_v8i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: shuffle_v8i64:
+; X86-AVX512BW:       # %bb.0: # %entry
+; X86-AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm2
+; X86-AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; X86-AVX512BW-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm2[2,3],zmm0[4,5],zmm2[6,7],zmm0[8,9],zmm2[10,11],zmm0[12,13],zmm2[14,15]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: shuffle_v8i64:
+; X64-AVX512BW:       # %bb.0: # %entry
+; X64-AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
+; X64-AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
+; X64-AVX512BW-NEXT:    retq
 entry:
   %t2 = add nsw <16 x i32> %t0, %t1
   %t3 = sub nsw <16 x i32> %t0, %t1
@@ -19,12 +26,19 @@ entry:
 }
 
 define <8 x i32> @shuffle_v4i64(<8 x i32> %t0, <8 x i32> %t1) {
-; CHECK-LABEL: shuffle_v4i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: shuffle_v4i64:
+; X86-AVX512BW:       # %bb.0: # %entry
+; X86-AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; X86-AVX512BW-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: shuffle_v4i64:
+; X64-AVX512BW:       # %bb.0: # %entry
+; X64-AVX512BW-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; X64-AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; X64-AVX512BW-NEXT:    retq
 entry:
   %t2 = add nsw <8 x i32> %t0, %t1
   %t3 = sub nsw <8 x i32> %t0, %t1
@@ -33,12 +47,19 @@ entry:
 }
 
 define <4 x i32> @shuffle_v2i64(<4 x i32> %t0, <4 x i32> %t1) {
-; CHECK-LABEL: shuffle_v2i64:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: shuffle_v2i64:
+; X86-AVX512BW:       # %bb.0: # %entry
+; X86-AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; X86-AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: shuffle_v2i64:
+; X64-AVX512BW:       # %bb.0: # %entry
+; X64-AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; X64-AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; X64-AVX512BW-NEXT:    retq
 entry:
   %t2 = add nsw <4 x i32> %t0, %t1
   %t3 = sub nsw <4 x i32> %t0, %t1
@@ -47,12 +68,19 @@ entry:
 }
 
 define <2 x i32> @shuffle_v2i32(<2 x i32> %t0, <2 x i32> %t1) {
-; CHECK-LABEL: shuffle_v2i32:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: shuffle_v2i32:
+; X86-AVX512BW:       # %bb.0: # %entry
+; X86-AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
+; X86-AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: shuffle_v2i32:
+; X64-AVX512BW:       # %bb.0: # %entry
+; X64-AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; X64-AVX512BW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
+; X64-AVX512BW-NEXT:    retq
 entry:
   %t2 = add nsw <2 x i32> %t0, %t1
   %t3 = sub nsw <2 x i32> %t0, %t1
@@ -97,13 +125,21 @@ define <64 x i8> @addb_selectw_64xi8(<64 x i8> %t0, <64 x i8> %t1) {
 }
 
 define <32 x i8> @addb_selectw_32xi8(<32 x i8> %t0, <32 x i8> %t1) {
-; CHECK-LABEL: addb_selectw_32xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addb_selectw_32xi8:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; X86-AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X86-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addb_selectw_32xi8:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm2
+; X64-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <32 x i8> %t0, %t1
   %t3 = sub nsw <32 x i8> %t0, %t1
   %t4 = shufflevector <32 x i8> %t2, <32 x i8> %t3, <32 x i32> <i32 32, i32 33, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -111,12 +147,19 @@ define <32 x i8> @addb_selectw_32xi8(<32 x i8> %t0, <32 x i8> %t1) {
 }
 
 define <16 x i8> @addb_selectw_16xi8(<16 x i8> %t0, <16 x i8> %t1) {
-; CHECK-LABEL: addb_selectw_16xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addb_selectw_16xi8:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; X86-AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addb_selectw_16xi8:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; X64-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <16 x i8> %t0, %t1
   %t3 = sub nsw <16 x i8> %t0, %t1
   %t4 = shufflevector <16 x i8> %t2, <16 x i8> %t3, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -124,12 +167,19 @@ define <16 x i8> @addb_selectw_16xi8(<16 x i8> %t0, <16 x i8> %t1) {
 }
 
 define <8 x i8> @addb_selectw_8xi8(<8 x i8> %t0, <8 x i8> %t1) {
-; CHECK-LABEL: addb_selectw_8xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addb_selectw_8xi8:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2
+; X86-AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addb_selectw_8xi8:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; X64-AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <8 x i8> %t0, %t1
   %t3 = sub nsw <8 x i8> %t0, %t1
   %t4 = shufflevector <8 x i8> %t2, <8 x i8> %t3, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -166,12 +216,19 @@ define <32 x i16> @addw_selectd_32xi16(<32 x i16> %t0, <32 x i16> %t1) {
 }
 
 define <16 x i16> @addw_selectd_16xi16(<16 x i16> %t0, <16 x i16> %t1) {
-; CHECK-LABEL: addw_selectd_16xi16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addw_selectd_16xi16:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm2
+; X86-AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addw_selectd_16xi16:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm2
+; X64-AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <16 x i16> %t0, %t1
   %t3 = sub nsw <16 x i16> %t0, %t1
   %t4 = shufflevector <16 x i16> %t2, <16 x i16> %t3, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -204,12 +261,19 @@ define <16 x i32> @addd_selectq_16xi32(<16 x i32> %t0, <16 x i32> %t1) {
 }
 
 define <8 x i32> @addd_selectq_8xi32(<8 x i32> %t0, <8 x i32> %t1) {
-; CHECK-LABEL: addd_selectq_8xi32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addd_selectq_8xi32:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; X86-AVX512BW-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; X86-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addd_selectq_8xi32:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
+; X64-AVX512BW-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; X64-AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <8 x i32> %t0, %t1
   %t3 = sub nsw <8 x i32> %t0, %t1
   %t4 = shufflevector <8 x i32> %t2, <8 x i32> %t3, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/avx512bf16-mov.ll b/llvm/test/CodeGen/X86/avx512bf16-mov.ll
index da52c42a41600..f1360c5ae945f 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-mov.ll
@@ -19,15 +19,15 @@ define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) {
 ; X86-LABEL: funbf16:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovups (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovups (%ecx), %xmm0
-; X86-NEXT:    vmovups %xmm0, (%eax)
-; X86-NEXT:    vmovaps (%ecx), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%eax)
-; X86-NEXT:    vmovups (%ecx), %ymm0
-; X86-NEXT:    vmovups %ymm0, (%eax)
-; X86-NEXT:    vmovaps (%ecx), %ymm0
-; X86-NEXT:    vmovaps %ymm0, (%eax)
+; X86-NEXT:    vmovups %xmm0, (%ecx)
+; X86-NEXT:    vmovaps (%eax), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%ecx)
+; X86-NEXT:    vmovups (%eax), %ymm0
+; X86-NEXT:    vmovups %ymm0, (%ecx)
+; X86-NEXT:    vmovaps (%eax), %ymm0
+; X86-NEXT:    vmovaps %ymm0, (%ecx)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll
index f51fe141fed21..693acf74b9601 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll
@@ -246,9 +246,9 @@ entry:
 define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
 ; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
 ; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index b3c261147c773..ae251a44c826d 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -234,9 +234,9 @@ entry:
 define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 {
 ; X86-LABEL: test_mm128_cvtneps2bf16_128_select:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vcvtneps2bf16 %xmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc9]
 ; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index ada2c8d53aa53..25b6e13aa0602 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -16,12 +16,12 @@ define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C,
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
 ; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
-; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
-; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
-; X86-NEXT:    kandd %k0, %k2, %k0
+; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k1
+; X86-NEXT:    kandd %k0, %k1, %k0
 ; X86-NEXT:    kmovd %k0, %eax
-; X86-NEXT:    kshiftrq $32, %k2, %k0
-; X86-NEXT:    kandd %k1, %k0, %k0
+; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k0
+; X86-NEXT:    kshiftrq $32, %k1, %k1
+; X86-NEXT:    kandd %k0, %k1, %k0
 ; X86-NEXT:    kmovd %k0, %edx
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -582,9 +582,10 @@ define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
 ; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    kmovd %k1, %edx
-; X86-NEXT:    kmovd %k0, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -676,9 +677,10 @@ define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64>
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
 ; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    kmovd %k1, %edx
-; X86-NEXT:    kmovd %k0, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 51ffeca52a665..d1b7f48b308aa 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -7,9 +7,9 @@ declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
 define i32 @test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) nounwind {
 ; X86-LABEL: test_int_x86_avx512_kunpck_wd:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 # encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04]
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    kunpckwd %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc1]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 # encoding: [0xc5,0xf8,0x90,0x44,0x24,0x08]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    kunpckwd %k0, %k1, %k0 # encoding: [0xc5,0xf4,0x4b,0xc0]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -29,8 +29,8 @@ declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
 define i64 @test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) nounwind {
 ; X86-LABEL: test_int_x86_avx512_kunpck_qd:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_kunpck_qd:
@@ -49,8 +49,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x5c,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpblendmb %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0xcb]
 ; X86-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
@@ -107,10 +107,10 @@ declare void @llvm.x86.avx512.mask.storeu.b.512(ptr, <64 x i8>, i64)
 define void @test_int_x86_avx512_mask_storeu_b_512(ptr %ptr1, ptr %ptr2, <64 x i8> %x1, i64 %x2) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu8 %zmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu8 %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -132,10 +132,10 @@ declare void @llvm.x86.avx512.mask.storeu.w.512(ptr, <32 x i16>, i32)
 define void @test_int_x86_avx512_mask_storeu_w_512(ptr %ptr1, ptr %ptr2, <32 x i16> %x1, i32 %x2) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu16 %zmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu16 %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -157,12 +157,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(ptr, <32 x i16>, i32)
 define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_512(ptr %ptr, ptr %ptr2, <32 x i16> %x1, i32 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpblendmw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x66,0x08]
-; X86-NEXT:    vmovdqu16 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x11]
+; X86-NEXT:    vpblendmw (%ecx), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x66,0x09]
+; X86-NEXT:    vmovdqu16 (%eax), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_512:
@@ -186,12 +186,12 @@ declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(ptr, <64 x i8>, i64)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512(ptr %ptr, ptr %ptr2, <64 x i8> %x1, i64 %mask) nounwind {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
-; X86-NEXT:    vpblendmb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x08]
-; X86-NEXT:    vmovdqu8 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x11]
+; X86-NEXT:    vpblendmb (%ecx), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x09]
+; X86-NEXT:    vmovdqu8 (%eax), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_512:
@@ -457,9 +457,10 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    andl %ecx, %eax # encoding: [0x21,0xc8]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -535,9 +536,10 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    andl %ecx, %eax # encoding: [0x21,0xc8]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1096,8 +1098,8 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psrl_wi_
 ; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpsrlw $3, %zmm0, %zmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x49,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x71,0xd0,0x04]
 ; X86-NEXT:    vpsrlw $5, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xc9,0x71,0xd0,0x05]
+; X86-NEXT:    vpsrlw $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x71,0xd0,0x04]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1858,14 +1860,14 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi # encoding: [0x57]
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc9]
+; X86-NEXT:    kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
-; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
@@ -1944,34 +1946,34 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X86-NEXT:    pushl %edi # encoding: [0x57]
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xd1]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
-; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
@@ -2033,14 +2035,14 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi # encoding: [0x57]
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc9]
+; X86-NEXT:    kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
-; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
-; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
 ; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
@@ -2119,34 +2121,34 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X86-NEXT:    pushl %edi # encoding: [0x57]
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xd1]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    kmovd %k2, %esi # encoding: [0xc5,0xfb,0x93,0xf2]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
-; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    addl %esi, %edx # encoding: [0x01,0xf2]
-; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
+; X86-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
-; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
-; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
 ; X86-NEXT:    kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
 ; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
-; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    adcl %eax, %ecx # encoding: [0x11,0xc1]
 ; X86-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
@@ -2206,9 +2208,9 @@ declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nou
 define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-LABEL: test_cmp_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
@@ -2271,9 +2273,9 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
-; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
@@ -2339,9 +2341,9 @@ declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) no
 define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-LABEL: test_ucmp_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
@@ -2404,9 +2406,9 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
 ; X86-NEXT:    pushl %esi # encoding: [0x56]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
-; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index 5b58233ac51aa..c74a90f45b46b 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -3,16 +3,27 @@
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 define i32 @test_int_x86_avx512_kadd_d(<32 x i16> %A, <32 x i16> %B) nounwind {
-; CHECK-LABEL: test_int_x86_avx512_kadd_d:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
-; CHECK-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
-; CHECK-NEXT:    kaddd %k1, %k0, %k0 # encoding: [0xc4,0xe1,0xfd,0x4a,0xc1]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    kortestd %k0, %k0 # encoding: [0xc4,0xe1,0xf9,0x98,0xc0]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_int_x86_avx512_kadd_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestmw %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmw %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc8]
+; X86-NEXT:    kaddd %k0, %k1, %k0 # encoding: [0xc4,0xe1,0xf5,0x4a,0xc0]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    kortestd %k0, %k0 # encoding: [0xc4,0xe1,0xf9,0x98,0xc0]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_kadd_d:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
+; X64-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
+; X64-NEXT:    kaddd %k1, %k0, %k0 # encoding: [0xc4,0xe1,0xfd,0x4a,0xc1]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    kortestd %k0, %k0 # encoding: [0xc4,0xe1,0xf9,0x98,0xc0]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = icmp ne <32 x i16> %A, zeroinitializer
   %1 = icmp ne <32 x i16> %B, zeroinitializer
@@ -27,9 +38,9 @@ declare <32 x i1> @llvm.x86.avx512.kadd.d(<32 x i1>, <32 x i1>)
 define i32 @test_int_x86_avx512_kadd_q(<64 x i8> %A, <64 x i8> %B) nounwind {
 ; X86-LABEL: test_int_x86_avx512_kadd_q:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0]
-; X86-NEXT:    vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9]
-; X86-NEXT:    kaddq %k1, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4a,0xc1]
+; X86-NEXT:    vptestmb %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmb %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc8]
+; X86-NEXT:    kaddq %k0, %k1, %k0 # encoding: [0xc4,0xe1,0xf4,0x4a,0xc0]
 ; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
 ; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
 ; X86-NEXT:    kortestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x98,0xc1]
@@ -59,15 +70,25 @@ entry:
 declare <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1>, <64 x i1>)
 
 define i32 @test_x86_avx512_ktestc_d(<32 x i16> %A, <32 x i16> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestc_d:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
-; CHECK-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1]
-; CHECK-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestc_d:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmw %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmw %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestd %k0, %k1 # encoding: [0xc4,0xe1,0xf9,0x99,0xc8]
+; X86-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestc_d:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
+; X64-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1]
+; X64-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <32 x i16> %A, zeroinitializer
   %2 = icmp ne <32 x i16> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestc.d(<32 x i1> %1, <32 x i1> %2) ; <i32> [#uses=1]
@@ -76,15 +97,25 @@ define i32 @test_x86_avx512_ktestc_d(<32 x i16> %A, <32 x i16> %B) {
 declare i32 @llvm.x86.avx512.ktestc.d(<32 x i1>, <32 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestz_d(<32 x i16> %A, <32 x i16> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestz_d:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
-; CHECK-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestz_d:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmw %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmw %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestd %k0, %k1 # encoding: [0xc4,0xe1,0xf9,0x99,0xc8]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestz_d:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmw %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc0]
+; X64-NEXT:    vptestmw %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x26,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestd %k1, %k0 # encoding: [0xc4,0xe1,0xf9,0x99,0xc1]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <32 x i16> %A, zeroinitializer
   %2 = icmp ne <32 x i16> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestz.d(<32 x i1> %1, <32 x i1> %2) ; <i32> [#uses=1]
@@ -93,15 +124,25 @@ define i32 @test_x86_avx512_ktestz_d(<32 x i16> %A, <32 x i16> %B) {
 declare i32 @llvm.x86.avx512.ktestz.d(<32 x i1>, <32 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestc_q(<64 x i8> %A, <64 x i8> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestc_q:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0]
-; CHECK-NEXT:    vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1]
-; CHECK-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestc_q:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmb %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmb %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestq %k0, %k1 # encoding: [0xc4,0xe1,0xf8,0x99,0xc8]
+; X86-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestc_q:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0]
+; X64-NEXT:    vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1]
+; X64-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <64 x i8> %A, zeroinitializer
   %2 = icmp ne <64 x i8> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestc.q(<64 x i1> %1, <64 x i1> %2) ; <i32> [#uses=1]
@@ -110,15 +151,25 @@ define i32 @test_x86_avx512_ktestc_q(<64 x i8> %A, <64 x i8> %B) {
 declare i32 @llvm.x86.avx512.ktestc.q(<64 x i1>, <64 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestz_q(<64 x i8> %A, <64 x i8> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestz_q:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0]
-; CHECK-NEXT:    vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestz_q:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmb %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc1]
+; X86-NEXT:    vptestmb %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestq %k0, %k1 # encoding: [0xc4,0xe1,0xf8,0x99,0xc8]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestz_q:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmb %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc0]
+; X64-NEXT:    vptestmb %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x26,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestq %k1, %k0 # encoding: [0xc4,0xe1,0xf8,0x99,0xc1]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <64 x i8> %A, zeroinitializer
   %2 = icmp ne <64 x i8> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestz.q(<64 x i1> %1, <64 x i1> %2) ; <i32> [#uses=1]
@@ -1017,9 +1068,9 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(ptr %ptr, <32 x i16>, i32)
 define void @test_int_x86_avx512_mask_pmov_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovwb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x30,0x00]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovwb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1086,9 +1137,9 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(ptr %ptr, <32 x i16>, i32)
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovswb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x20,0x00]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovswb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1155,9 +1206,9 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(ptr %ptr, <32 x i16>, i32)
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovuswb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x10,0x00]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovuswb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index ae710cc40a522..252b919c48f25 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -37,9 +37,9 @@ declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x5c,0x24,0x04]
 ; X86-NEXT:    vpblendmw %xmm3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0xcb]
 ; X86-NEXT:    vmovdqu16 %xmm3, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
@@ -400,10 +400,10 @@ declare void @llvm.x86.avx512.mask.storeu.b.128(ptr, <16 x i8>, i16)
 define void @test_int_x86_avx512_mask_storeu_b_128(ptr %ptr1, ptr %ptr2, <16 x i8> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu8 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu8 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -423,10 +423,10 @@ declare void @llvm.x86.avx512.mask.storeu.b.256(ptr, <32 x i8>, i32)
 define void @test_int_x86_avx512_mask_storeu_b_256(ptr %ptr1, ptr %ptr2, <32 x i8> %x1, i32 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu8 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu8 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -448,11 +448,11 @@ declare void @llvm.x86.avx512.mask.storeu.w.128(ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_storeu_w_128(ptr %ptr1, ptr %ptr2, <8 x i16> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu16 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X86-NEXT:    vmovdqu16 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7f,0x01]
 ; X86-NEXT:    vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -472,10 +472,10 @@ declare void @llvm.x86.avx512.mask.storeu.w.256(ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_storeu_w_256(ptr %ptr1, ptr %ptr2, <16 x i16> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vmovdqu16 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu16 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -497,13 +497,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(ptr, <8 x i16>, i8)
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_loadu_w_128(ptr %ptr, ptr %ptr2, <8 x i16> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
-; X86-NEXT:    vpblendmw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0x08]
-; X86-NEXT:    vmovdqu16 (%ecx), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x11]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vpblendmw (%ecx), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x66,0x09]
+; X86-NEXT:    vmovdqu16 (%eax), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_128:
@@ -527,12 +527,12 @@ declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(ptr, <16 x i16>, i16)
 define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_loadu_w_256(ptr %ptr, ptr %ptr2, <16 x i16> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpblendmw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x66,0x08]
-; X86-NEXT:    vmovdqu16 (%ecx), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x11]
+; X86-NEXT:    vpblendmw (%ecx), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x66,0x09]
+; X86-NEXT:    vmovdqu16 (%eax), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_w_256:
@@ -556,12 +556,12 @@ declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(ptr, <16 x i8>, i16)
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_loadu_b_128(ptr %ptr, ptr %ptr2, <16 x i8> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpblendmb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x66,0x08]
-; X86-NEXT:    vmovdqu8 (%ecx), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x11]
+; X86-NEXT:    vpblendmb (%ecx), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x66,0x09]
+; X86-NEXT:    vmovdqu8 (%eax), %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_128:
@@ -585,12 +585,12 @@ declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(ptr, <32 x i8>, i32)
 define { <32 x i8>, <32 x i8>, <32 x i8> } @test_int_x86_avx512_mask_loadu_b_256(ptr %ptr, ptr %ptr2, <32 x i8> %x1, i32 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
-; X86-NEXT:    vpblendmb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x66,0x08]
-; X86-NEXT:    vmovdqu8 (%ecx), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x11]
+; X86-NEXT:    vpblendmb (%ecx), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x66,0x09]
+; X86-NEXT:    vmovdqu8 (%eax), %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x10]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_loadu_b_256:
@@ -1505,9 +1505,9 @@ define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1526,9 +1526,9 @@ define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16
 define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1707,9 +1707,9 @@ define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1728,9 +1728,9 @@ define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16
 define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2206,9 +2206,9 @@ define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpmullw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2227,9 +2227,9 @@ define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i
 define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpmullw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3154,8 +3154,8 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_psrl_wi_128
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vpsrlw $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xd0,0x04]
 ; X86-NEXT:    vpsrlw $5, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x71,0xd0,0x05]
+; X86-NEXT:    vpsrlw $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x71,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3185,8 +3185,8 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_psrl_wi_
 ; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpsrlw $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x71,0xd0,0x03]
-; X86-NEXT:    vpsrlw $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x71,0xd0,0x04]
 ; X86-NEXT:    vpsrlw $5, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xd0,0x05]
+; X86-NEXT:    vpsrlw $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x71,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3816,9 +3816,9 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3837,9 +3837,9 @@ define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i
 define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3874,9 +3874,9 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3897,9 +3897,9 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4339,9 +4339,9 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4360,9 +4360,9 @@ define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4397,9 +4397,9 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4420,9 +4420,9 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4800,47 +4800,32 @@ declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32
 define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-LABEL: test_cmp_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx # encoding: [0x53]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %edi # encoding: [0x57]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    .cfi_offset %esi, -16
-; X86-NEXT:    .cfi_offset %edi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; X86-NEXT:    vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    vpcmpnltb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
-; X86-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
-; X86-NEXT:    kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8]
-; X86-NEXT:    vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; X86-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; X86-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02]
-; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; X86-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
-; X86-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[3]
-; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
-; X86-NEXT:    vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
-; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
-; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
-; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %edi # encoding: [0x5f]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebx # encoding: [0x5b]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01]
+; X86-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x02]
+; X86-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x76,0xdb]
+; X86-NEXT:    vpblendd $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x02,0xd3,0x08]
+; X86-NEXT:    # xmm2 = xmm2[0,1,2],xmm3[3]
+; X86-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd8]
+; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xe0]
+; X86-NEXT:    vpunpckldq %xmm3, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x62,0xdb]
+; X86-NEXT:    # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X86-NEXT:    vpunpcklqdq %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x6c,0xc0]
+; X86-NEXT:    # xmm0 = xmm3[0],xmm0[0]
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_cmp_b_256:
@@ -4894,52 +4879,32 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
 ; X86-LABEL: test_mask_cmp_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %ebx # encoding: [0x53]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %edi # encoding: [0x57]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; X86-NEXT:    vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
-; X86-NEXT:    vpcmpnltb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
 ; X86-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
-; X86-NEXT:    kmovd %k0, %ebp # encoding: [0xc5,0xfb,0x93,0xe8]
-; X86-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
-; X86-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x01]
-; X86-NEXT:    vpinsrd $2, %ebp, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc5,0x02]
-; X86-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
-; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
-; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    vmovd %esi, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd6]
-; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
-; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi # encoding: [0x5f]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx # encoding: [0x5b]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x02]
+; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; X86-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd8]
+; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xe0]
+; X86-NEXT:    vpunpckldq %xmm3, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x62,0xdb]
+; X86-NEXT:    # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X86-NEXT:    vpunpcklqdq %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x6c,0xc0]
+; X86-NEXT:    # xmm0 = xmm3[0],xmm0[0]
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mask_cmp_b_256:
@@ -4994,47 +4959,32 @@ declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) noun
 define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 ; X86-LABEL: test_ucmp_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx # encoding: [0x53]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %edi # encoding: [0x57]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    .cfi_offset %esi, -16
-; X86-NEXT:    .cfi_offset %edi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; X86-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
-; X86-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
-; X86-NEXT:    kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8]
-; X86-NEXT:    vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; X86-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
-; X86-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x02]
-; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; X86-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
-; X86-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[3]
-; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
-; X86-NEXT:    vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0]
-; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
-; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
-; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %edi # encoding: [0x5f]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebx # encoding: [0x5b]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01]
+; X86-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x02]
+; X86-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x76,0xdb]
+; X86-NEXT:    vpblendd $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x02,0xd3,0x08]
+; X86-NEXT:    # xmm2 = xmm2[0,1,2],xmm3[3]
+; X86-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd8]
+; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xe0]
+; X86-NEXT:    vpunpckldq %xmm3, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x62,0xdb]
+; X86-NEXT:    # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X86-NEXT:    vpunpcklqdq %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x6c,0xc0]
+; X86-NEXT:    # xmm0 = xmm3[0],xmm0[0]
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_ucmp_b_256:
@@ -5088,52 +5038,32 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
 define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
 ; X86-LABEL: test_mask_ucmp_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %ebx # encoding: [0x53]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %edi # encoding: [0x57]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; X86-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
-; X86-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %ebx # encoding: [0xc5,0xfb,0x93,0xd8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01]
 ; X86-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
-; X86-NEXT:    kmovd %k0, %ebp # encoding: [0xc5,0xfb,0x93,0xe8]
-; X86-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
-; X86-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc3,0x01]
-; X86-NEXT:    vpinsrd $2, %ebp, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc5,0x02]
-; X86-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
-; X86-NEXT:    vmovd %edx, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xca]
-; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
-; X86-NEXT:    vpunpckldq %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x62,0xc9]
-; X86-NEXT:    # xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    vmovd %esi, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd6]
-; X86-NEXT:    vpunpcklqdq %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xca]
-; X86-NEXT:    # xmm1 = xmm1[0],xmm2[0]
-; X86-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi # encoding: [0x5f]
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx # encoding: [0x5b]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x02]
+; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x03]
+; X86-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd8]
+; X86-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xe0]
+; X86-NEXT:    vpunpckldq %xmm3, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x62,0xdb]
+; X86-NEXT:    # xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X86-NEXT:    vpunpcklqdq %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x6c,0xc0]
+; X86-NEXT:    # xmm0 = xmm3[0],xmm0[0]
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc2,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mask_ucmp_b_256:
@@ -5186,31 +5116,57 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
 declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
 
 define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: test_cmp_w_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
-; CHECK-NEXT:    vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8]
-; CHECK-NEXT:    vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; X64-NEXT:    vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8]
+; X64-NEXT:    vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
@@ -5235,24 +5191,24 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask)
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
-; X86-NEXT:    vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0]
-; X86-NEXT:    vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5303,31 +5259,57 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask)
 declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK-LABEL: test_ucmp_w_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
-; CHECK-NEXT:    vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_w_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_w_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; X64-NEXT:    vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
+; X64-NEXT:    vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
@@ -5352,24 +5334,24 @@ define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
-; X86-NEXT:    vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01]
-; X86-NEXT:    vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5420,30 +5402,55 @@ define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask
 declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
 
 define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_cmp_b_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
-; CHECK-NEXT:    vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8]
-; CHECK-NEXT:    vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_b_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_b_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; X64-NEXT:    vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8]
+; X64-NEXT:    vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
@@ -5468,24 +5475,24 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
-; X86-NEXT:    vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0]
-; X86-NEXT:    vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -5534,30 +5541,55 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
 
 define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_ucmp_b_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
-; CHECK-NEXT:    vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
-; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_b_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01]
+; X86-NEXT:    vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02]
+; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04]
+; X86-NEXT:    vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05]
+; X86-NEXT:    vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X86-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_b_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; X64-NEXT:    vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
+; X64-NEXT:    vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; X64-NEXT:    # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
@@ -5582,24 +5614,24 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask)
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
-; X86-NEXT:    vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01]
-; X86-NEXT:    vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2]
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
-; X86-NEXT:    kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd]
-; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01]
+; X86-NEXT:    vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02]
+; X86-NEXT:    vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04]
+; X86-NEXT:    vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05]
+; X86-NEXT:    vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06]
 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -5648,30 +5680,55 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask)
 declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
 
 define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_cmp_w_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
-; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpgtw %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x65,0xc0]
-; CHECK-NEXT:    vpcmplew %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x05]
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xe1]
-; CHECK-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtw %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmplew %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtw %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x65,0xc0]
+; X64-NEXT:    vpcmplew %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x02]
+; X64-NEXT:    vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x05]
+; X64-NEXT:    vpcmpgtw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf1,0x7d,0x08,0x65,0xe1]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
@@ -5696,25 +5753,25 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpgtw %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x65,0xc0]
-; X86-NEXT:    vpcmplew %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x02]
-; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x05]
-; X86-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc9]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx # encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmplew %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -5764,30 +5821,55 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_ucmp_w_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
-; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleuw %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x05]
-; CHECK-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x06]
-; CHECK-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_w_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_w_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltuw %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x01]
+; X64-NEXT:    vpcmpleuw %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x02]
+; X64-NEXT:    vpcmpneqw %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x05]
+; X64-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x06]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
@@ -5812,25 +5894,25 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X86-NEXT:    vpcmpltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x01]
-; X86-NEXT:    vpcmpleuw %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x02]
-; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x05]
-; X86-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx # encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmpleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -7056,9 +7138,9 @@ define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_adds_epu16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -7077,9 +7159,9 @@ define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i1
 define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_adds_epu16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -7258,9 +7340,9 @@ define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_subs_epu16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -7279,9 +7361,9 @@ define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i1
 define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_subs_epu16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -7856,9 +7938,9 @@ define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_adds_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -7877,9 +7959,9 @@ define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i1
 define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_adds_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -8039,9 +8121,9 @@ define <8 x i16> @test_test_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %
 define <8 x i16> @test_test_subs_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_test_subs_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -8062,9 +8144,9 @@ define <8 x i16> @test_test_subs_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i1
 define <8 x i16> @test_test_subs_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_test_subs_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -8390,9 +8472,9 @@ define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_subs_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -8411,9 +8493,9 @@ define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i1
 define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_subs_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -8573,9 +8655,9 @@ define <8 x i16> @test_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
 define <8 x i16> @test_adds_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_adds_epi16_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -8596,9 +8678,9 @@ define <8 x i16> @test_adds_epi16_rmk_128(<8 x i16> %a, ptr %ptr_b, <8 x i16> %p
 define <8 x i16> @test_adds_epi16_rmkz_128(<8 x i16> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_adds_epi16_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 01b153b4fbc61..43ce4ec48974a 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -70,9 +70,9 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -93,9 +93,9 @@ define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i
 define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -132,9 +132,9 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -157,9 +157,9 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packs_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -633,9 +633,9 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -656,9 +656,9 @@ define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -695,9 +695,9 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -720,9 +720,9 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_packus_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1517,10 +1517,9 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmov_wb_128
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X86-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; X86-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
@@ -1545,10 +1544,10 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(ptr %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmov_wb_mem_128(ptr %ptr, <8 x i16> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovwb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x30,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpmovwb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1570,10 +1569,9 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovs_wb_12
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X86-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; X86-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
@@ -1598,10 +1596,10 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(ptr %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(ptr %ptr, <8 x i16> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovswb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x20,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpmovswb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1623,10 +1621,9 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovus_wb_1
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X86-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; X86-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
-; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT:    vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
@@ -1651,10 +1648,10 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(ptr %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(ptr %ptr, <8 x i16> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovuswb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x10,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpmovuswb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1726,9 +1723,9 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(ptr %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmov_wb_mem_256(ptr %ptr, <16 x i16> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovwb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x30,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovwb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1800,9 +1797,9 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(ptr %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(ptr %ptr, <16 x i16> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovswb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x20,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovswb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1874,9 +1871,9 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(ptr %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(ptr %ptr, <16 x i16> %x1, i16 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovuswb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x10,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
 ; X86-NEXT:    vpmovuswb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
index 70f60c802a2d5..5a4cb028e1ca1 100644
--- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -5,16 +5,27 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512DQVL
 
 define i32 @test_int_x86_avx512_kadd_w(<16 x i32> %A, <16 x i32> %B) nounwind {
-; CHECK-LABEL: test_int_x86_avx512_kadd_w:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
-; CHECK-NEXT:    kaddw %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4a,0xc1]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    kortestw %k0, %k0 # encoding: [0xc5,0xf8,0x98,0xc0]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_int_x86_avx512_kadd_w:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestmd %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmd %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
+; X86-NEXT:    kaddw %k0, %k1, %k0 # encoding: [0xc5,0xf4,0x4a,0xc0]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    kortestw %k0, %k0 # encoding: [0xc5,0xf8,0x98,0xc0]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_kadd_w:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
+; X64-NEXT:    kaddw %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4a,0xc1]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    kortestw %k0, %k0 # encoding: [0xc5,0xf8,0x98,0xc0]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = icmp ne <16 x i32> %A, zeroinitializer
   %1 = icmp ne <16 x i32> %B, zeroinitializer
@@ -27,16 +38,27 @@ entry:
 declare <16 x i1> @llvm.x86.avx512.kadd.w(<16 x i1>, <16 x i1>)
 
 define i32 @test_int_x86_avx512_kadd_b(<8 x i64> %A, <8 x i64> %B) nounwind {
-; CHECK-LABEL: test_int_x86_avx512_kadd_b:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
-; CHECK-NEXT:    kaddb %k1, %k0, %k0 # encoding: [0xc5,0xfd,0x4a,0xc1]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    kortestb %k0, %k0 # encoding: [0xc5,0xf9,0x98,0xc0]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_int_x86_avx512_kadd_b:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestmq %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmq %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
+; X86-NEXT:    kaddb %k0, %k1, %k0 # encoding: [0xc5,0xf5,0x4a,0xc0]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    kortestb %k0, %k0 # encoding: [0xc5,0xf9,0x98,0xc0]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_kadd_b:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
+; X64-NEXT:    kaddb %k1, %k0, %k0 # encoding: [0xc5,0xfd,0x4a,0xc1]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    kortestb %k0, %k0 # encoding: [0xc5,0xf9,0x98,0xc0]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = icmp ne <8 x i64> %A, zeroinitializer
   %1 = icmp ne <8 x i64> %B, zeroinitializer
@@ -49,15 +71,25 @@ entry:
 declare <8 x i1> @llvm.x86.avx512.kadd.b(<8 x i1>, <8 x i1>)
 
 define i32 @test_x86_avx512_ktestc_w(<16 x i32> %A, <16 x i32> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestc_w:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1]
-; CHECK-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestc_w:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmd %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmd %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestw %k0, %k1 # encoding: [0xc5,0xf8,0x99,0xc8]
+; X86-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestc_w:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1]
+; X64-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <16 x i32> %A, zeroinitializer
   %2 = icmp ne <16 x i32> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestc.w(<16 x i1> %1, <16 x i1> %2) ; <i32> [#uses=1]
@@ -66,15 +98,25 @@ define i32 @test_x86_avx512_ktestc_w(<16 x i32> %A, <16 x i32> %B) {
 declare i32 @llvm.x86.avx512.ktestc.w(<16 x i1>, <16 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestz_w(<16 x i32> %A, <16 x i32> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestz_w:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestz_w:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmd %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmd %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestw %k0, %k1 # encoding: [0xc5,0xf8,0x99,0xc8]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestz_w:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmd %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmd %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0x75,0x48,0x27,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestw %k1, %k0 # encoding: [0xc5,0xf8,0x99,0xc1]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <16 x i32> %A, zeroinitializer
   %2 = icmp ne <16 x i32> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestz.w(<16 x i1> %1, <16 x i1> %2) ; <i32> [#uses=1]
@@ -83,15 +125,25 @@ define i32 @test_x86_avx512_ktestz_w(<16 x i32> %A, <16 x i32> %B) {
 declare i32 @llvm.x86.avx512.ktestz.w(<16 x i1>, <16 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestc_b(<8 x i64> %A, <8 x i64> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestc_b:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1]
-; CHECK-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestc_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmq %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmq %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestb %k0, %k1 # encoding: [0xc5,0xf9,0x99,0xc8]
+; X86-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestc_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1]
+; X64-NEXT:    setb %al # encoding: [0x0f,0x92,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <8 x i64> %A, zeroinitializer
   %2 = icmp ne <8 x i64> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestc.b(<8 x i1> %1, <8 x i1> %2) ; <i32> [#uses=1]
@@ -100,15 +152,25 @@ define i32 @test_x86_avx512_ktestc_b(<8 x i64> %A, <8 x i64> %B) {
 declare i32 @llvm.x86.avx512.ktestc.b(<8 x i1>, <8 x i1>) nounwind readnone
 
 define i32 @test_x86_avx512_ktestz_b(<8 x i64> %A, <8 x i64> %B) {
-; CHECK-LABEL: test_x86_avx512_ktestz_b:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
-; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
-; CHECK-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT:    ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1]
-; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_x86_avx512_ktestz_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmq %zmm1, %zmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc1]
+; X86-NEXT:    vptestmq %zmm0, %zmm0, %k1 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc8]
+; X86-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X86-NEXT:    ktestb %k0, %k1 # encoding: [0xc5,0xf9,0x99,0xc8]
+; X86-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_ktestz_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmq %zmm0, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc0]
+; X64-NEXT:    vptestmq %zmm1, %zmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x48,0x27,0xc9]
+; X64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; X64-NEXT:    ktestb %k1, %k0 # encoding: [0xc5,0xf9,0x99,0xc1]
+; X64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = icmp ne <8 x i64> %A, zeroinitializer
   %2 = icmp ne <8 x i64> %B, zeroinitializer
   %res = call i32 @llvm.x86.avx512.ktestz.b(<8 x i1> %1, <8 x i1> %2) ; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 6f3be88d7cd0c..f6a39c9135122 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -64,8 +64,8 @@ define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_ps_rmk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandnps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x55,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -84,8 +84,8 @@ define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x
 define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_ps_rmkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandnps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x55,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -120,8 +120,8 @@ define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_ps_rmbk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandnps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x55,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -142,8 +142,8 @@ define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4
 define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_ps_rmbkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandnps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x55,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -541,8 +541,8 @@ define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_ps_rmk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x54,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -561,8 +561,8 @@ define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x fl
 define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_ps_rmkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x54,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -597,8 +597,8 @@ define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_ps_rmbk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x54,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -619,8 +619,8 @@ define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x f
 define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_ps_rmbkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vandps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x54,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1018,8 +1018,8 @@ define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_ps_rmk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x56,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1038,8 +1038,8 @@ define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x flo
 define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_ps_rmkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x56,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1074,8 +1074,8 @@ define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_ps_rmbk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x56,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1096,8 +1096,8 @@ define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x fl
 define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_ps_rmbkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x56,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1495,8 +1495,8 @@ define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_ps_rmk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vxorps (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x57,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1515,8 +1515,8 @@ define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, ptr %ptr_b, <4 x fl
 define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_ps_rmkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vxorps (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x89,0x57,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1551,8 +1551,8 @@ define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, ptr %ptr_b) {
 define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_ps_rmbk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vxorps (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x19,0x57,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1573,8 +1573,8 @@ define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, ptr %ptr_b, <4 x f
 define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_ps_rmbkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vxorps (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0x99,0x57,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2130,8 +2130,8 @@ define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, ptr %ptr_b) {
 define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, ptr %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmk_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x40,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2150,8 +2150,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, ptr %ptr_b, <4 x i
 define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmkz_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2186,8 +2186,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, ptr %ptr_b) {
 define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, ptr %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmbk_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x40,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2208,8 +2208,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, ptr %ptr_b, <4 x
 define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2289,8 +2289,8 @@ define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, ptr %ptr_b) {
 define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, ptr %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x40,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2309,8 +2309,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, ptr %ptr_b, <2 x i
 define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x40,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2345,8 +2345,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, ptr %ptr_b) {
 define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, ptr %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmbk_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x40,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2367,8 +2367,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, ptr %ptr_b, <2 x
 define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmullq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x40,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2654,8 +2654,8 @@ define <4 x double>@test_int_x86_avx512_maskz_broadcastf64x2_256(<2 x double> %x
 define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(ptr %x0ptr, <4 x double> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vbroadcastf64x2 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x1a,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = mem[0,1,0,1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2728,8 +2728,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_broadcasti64x2_256(<2 x i64> %x0, i8
 define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(ptr %x0ptr, <4 x i64> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vbroadcasti64x2 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x5a,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = mem[0,1,0,1]
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ec94b593148df..a8f36bfc5b889 100644
--- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -167,8 +167,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2qq_128_load(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -186,8 +186,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load(ptr %p, <2 x i64>
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2qq_128_load(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2qq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -223,8 +223,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2qq_128_load_2(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load_2(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -242,8 +242,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load_2(ptr %p, <2 x i64
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2qq_128_load_2(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2qq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -277,8 +277,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2qq_128_load_3(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load_3(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -295,8 +295,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2qq_128_load_3(ptr %p, <2 x i64
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2qq_128_load_3(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2qq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -388,8 +388,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2uqq_128_load(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -407,8 +407,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load(ptr %p, <2 x i64>
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2uqq_128_load(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2uqq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -443,8 +443,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2uqq_128_load_2(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load_2(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -462,8 +462,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load_2(ptr %p, <2 x i6
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_2(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -497,8 +497,8 @@ define <2 x i64> @test_int_x86_avx512_cvt_ps2uqq_128_load_3(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load_3(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -515,8 +515,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvt_ps2uqq_128_load_3(ptr %p, <2 x i6
 define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_3(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvtps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x79,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -816,8 +816,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2qq_128_load(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -835,8 +835,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load(ptr %p, <2 x i64>
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -872,8 +872,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2qq_128_load_2(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -891,8 +891,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2(ptr %p, <2 x i6
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_2(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -926,8 +926,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2qq_128_load_3(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -944,8 +944,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3(ptr %p, <2 x i6
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1130,8 +1130,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2uqq_128_load(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1149,8 +1149,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load(ptr %p, <2 x i64
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1186,8 +1186,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2uqq_128_load_2(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1205,8 +1205,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2(ptr %p, <2 x i
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_2(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1240,8 +1240,8 @@ define <2 x i64> @test_int_x86_avx512_cvtt_ps2uqq_128_load_3(ptr %p) {
 define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3(ptr %p, <2 x i64> %passthru, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1258,8 +1258,8 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3(ptr %p, <2 x i
 define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3(ptr %p, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index cc58bc1e44f37..e63e3569e92c4 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -817,22 +817,18 @@ define i128 @half_to_s128(half %x) {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixhfti
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl $4
@@ -920,22 +916,18 @@ define i128 @half_to_u128(half %x) {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunshfti
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl $4
@@ -1000,10 +992,8 @@ define fp128 @half_to_f128(half %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
@@ -1011,11 +1001,10 @@ define fp128 @half_to_f128(half %x) nounwind {
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __extendsftf2
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %a = fpext half %x to fp128
@@ -1079,8 +1068,8 @@ define void @f16tou32(ptr %ptr) {
 ;
 ; X86-LABEL: f16tou32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vcvttph2udq {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-NEXT:    retl
   %1 = fptoui <2 x half> splat (half 0xH7E00) to <2 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
index 3d4fa9e2cc6fa..5816e98a87c2f 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll
@@ -468,9 +468,9 @@ define <32 x half> @test_x86_fma_vfnmsub_ph_512(<32 x half> %a0, <32 x half> %a1
 define <8 x half>@test_int_x86_avx512_mask3_vfmadd_sh(<8 x half> %x0, <8 x half> %x1, ptr%ptr_b, i8 %x3, i32 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_sh:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vfmadd231sh (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf6,0x7d,0x09,0xb9,0x08]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -531,16 +531,16 @@ define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) {
 ; X86-LABEL: fmadd_sh_mask_memfold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x02]
+; X86-NEXT:    # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x09]
 ; X86-NEXT:    vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8]
-; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1]
-; X86-NEXT:    vmovsh %xmm0, (%edx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x02]
+; X86-NEXT:    vmovsh %xmm0, (%eax) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: fmadd_sh_mask_memfold:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 5e6c11e7fa100..18355381e3828 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1119,9 +1119,9 @@ define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 ;
 ; X86-LABEL: load8f16mask:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
 ; X86-NEXT:    retl
 ;
@@ -1149,9 +1149,9 @@ define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
 ;
 ; X86-LABEL: load8f16maskz:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
@@ -1193,9 +1193,9 @@ define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
 ;
 ; X86-LABEL: loadu8f16mask:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
 ; X86-NEXT:    retl
 ;
@@ -1223,9 +1223,9 @@ define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
 ;
 ; X86-LABEL: loadu8f16maskz:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    kmovd %ecx, %k1
 ; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
@@ -1470,9 +1470,9 @@ define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
 ;
 ; X86-LABEL: movsh:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; X86-NEXT:    vaddph %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovsh {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
+; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-NOVL-LABEL: movsh:
@@ -1655,9 +1655,9 @@ define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $128, %esp
+; X86-NEXT:    vmovaps %zmm0, (%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    andl $31, %eax
-; X86-NEXT:    vmovaps %zmm0, (%esp)
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -1689,10 +1689,10 @@ define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    andl $63, %eax
 ; X86-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovaps %zmm0, (%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    andl $63, %eax
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -1807,8 +1807,8 @@ define void @extract_store_f16_1(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 1
@@ -1825,8 +1825,8 @@ define void @extract_store_f16_2(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 2
@@ -1843,8 +1843,8 @@ define void @extract_store_f16_3(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 3
@@ -1861,8 +1861,8 @@ define void @extract_store_f16_4(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 4
@@ -1879,8 +1879,8 @@ define void @extract_store_f16_5(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_5:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 5
@@ -1897,8 +1897,8 @@ define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_6:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 6
@@ -1915,8 +1915,8 @@ define void @extract_store_f16_7(<8 x half> %x, ptr %y) {
 ;
 ; X86-LABEL: extract_store_f16_7:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
    %res = extractelement <8 x half> %x, i32 7
@@ -2084,11 +2084,11 @@ define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3)
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; X86-NEXT:    retl
   %a = insertelement <8 x half> undef, half %a0, i32 0
   %b = insertelement <8 x half> %a, half %a1, i32 1
@@ -2110,11 +2110,11 @@ define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3)
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; X86-NEXT:    retl
   %a = insertelement <8 x half> undef, half %a0, i32 4
@@ -2140,19 +2140,19 @@ define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3,
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X86-NEXT:    retl
   %a = insertelement <8 x half> undef, half %a0, i32 0
   %b = insertelement <8 x half> %a, half %a1, i32 1
@@ -2182,20 +2182,20 @@ define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2,
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
-; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT:    vpbroadcastq %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
   %a = insertelement <16 x half> undef, half %a0, i32 0
   %b = insertelement <16 x half> %a, half %a1, i32 1
@@ -2273,11 +2273,11 @@ define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
 ; X86-LABEL: load_store_v4f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vaddph %xmm0, %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -2320,7 +2320,7 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; X86-NEXT:    retl
@@ -2436,12 +2436,12 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
-; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
-; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [112,112,112,112,112,112,112,112]
+; X86-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
+; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
 ; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; X86-NEXT:    movl %ebp, %esp
@@ -2472,11 +2472,11 @@ define <8 x i16> @pr59628_xmm(i16 %arg) {
 ; X86-LABEL: pr59628_xmm:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vpbroadcastw %eax, %xmm1
-; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
-; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    vpbroadcastw %eax, %xmm0
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vmovsh {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %k1
+; X86-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
 ; X64-NOVL-LABEL: pr59628_xmm:
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
index 41ad4662f5ee0..d68d81e9d6afd 100644
--- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
@@ -177,9 +177,9 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -197,9 +197,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -219,9 +219,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64>
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -239,9 +239,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -261,9 +261,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -281,9 +281,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0,
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -303,9 +303,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64>
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -323,9 +323,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i6
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 4ef63cfb6abb8..90d603a32498f 100644
--- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -181,9 +181,9 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -203,9 +203,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -227,9 +227,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64>
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -249,9 +249,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -273,9 +273,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -295,9 +295,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0,
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, ptr %x2ptr, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -319,9 +319,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64>
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax), %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -341,9 +341,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i6
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, ptr %x1ptr, <8 x i64> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
index d7a97408652c0..7ab9f7fbce3d3 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll
@@ -199,8 +199,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8>
 define void @test_mask_compress_store_w_512(ptr %addr, <32 x i16> %data, i32 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -220,8 +220,8 @@ declare void @llvm.x86.avx512.mask.compress.store.w.512(ptr %addr, <32 x i16> %d
 define void @test_compress_store_w_512(ptr %addr, <32 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnord %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -304,8 +304,8 @@ declare void @llvm.x86.avx512.mask.compress.store.b.512(ptr %addr, <64 x i8> %da
 define void @test_compress_store_b_512(ptr %addr, <64 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -723,9 +723,9 @@ define { <8 x i64>, <8 x i64> }@test_int_x86_avx512_mask_vpshrdv_q_512(<8 x i64>
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x73,0xda]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
@@ -841,9 +841,9 @@ define { <8 x i64>, <8 x i64> }@test_int_x86_avx512_mask_vpshldv_q_512(<8 x i64>
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x71,0x00]
 ; X86-NEXT:    vpshldvq %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x71,0xda]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
index a28f2aa5fc93e..caf3133d823e9 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
@@ -199,8 +199,8 @@ define <64 x i8> @test_maskz_expand_b_512(<64 x i8> %data, i64 %mask) {
 define void @test_mask_compress_store_w_512(ptr %addr, <32 x i16> %data, i32 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -219,8 +219,8 @@ define void @test_mask_compress_store_w_512(ptr %addr, <32 x i16> %data, i32 %ma
 define void @test_compress_store_w_512(ptr %addr, <32 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnord %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -302,8 +302,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mas
 define void @test_compress_store_b_512(ptr %addr, <64 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorq %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -549,9 +549,9 @@ define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_vpshrdv_q_512(<8 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x73,0xda]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
@@ -643,9 +643,9 @@ define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_vpshldv_q_512(<8 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x71,0x00]
 ; X86-NEXT:    vpshldvq %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x71,0xda]
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
index 1bcac8ff553d1..0342ee373d2a8 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
@@ -5,9 +5,9 @@
 define <8 x i16> @test_mask_expand_load_w_128(ptr %addr, <8 x i16> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -23,9 +23,9 @@ define <8 x i16> @test_mask_expand_load_w_128(ptr %addr, <8 x i16> %data, i8 %ma
 define <8 x i16> @test_maskz_expand_load_w_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -43,8 +43,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(ptr %addr, <8 x i16> %
 define <8 x i16> @test_expand_load_w_128(ptr %addr, <8 x i16> %data) {
 ; X86-LABEL: test_expand_load_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -203,9 +203,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8>
 define void @test_mask_compress_store_w_128(ptr %addr, <8 x i16> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcompressw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -223,8 +223,8 @@ declare void @llvm.x86.avx512.mask.compress.store.w.128(ptr %addr, <8 x i16> %da
 define void @test_compress_store_w_128(ptr %addr, <8 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -286,8 +286,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16
 define void @test_mask_compress_store_b_128(ptr %addr, <16 x i8> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -305,8 +305,8 @@ declare void @llvm.x86.avx512.mask.compress.store.b.128(ptr %addr, <16 x i8> %da
 define void @test_compress_store_b_128(ptr %addr, <16 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -560,8 +560,8 @@ declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8>
 define void @test_mask_compress_store_w_256(ptr %addr, <16 x i16> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -581,8 +581,8 @@ declare void @llvm.x86.avx512.mask.compress.store.w.256(ptr %addr, <16 x i16> %d
 define void @test_compress_store_w_256(ptr %addr, <16 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -644,8 +644,8 @@ declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x
 define void @test_mask_compress_store_b_256(ptr %addr, <32 x i8> %data, i32 %mask) {
 ; X86-LABEL: test_mask_compress_store_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -665,8 +665,8 @@ declare void @llvm.x86.avx512.mask.compress.store.b.256(ptr %addr, <32 x i8> %da
 define void @test_compress_store_b_256(ptr %addr, <32 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnord %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -728,14 +728,13 @@ declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -761,14 +760,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32,
 define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldd $22, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshldd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
@@ -794,14 +792,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32,
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldq $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshldq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
@@ -827,14 +824,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32,
 define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldq $22, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshldq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
@@ -860,14 +856,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32,
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xe1,0x06]
-; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x07]
+; X86-NEXT:    vpshldw $6, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xd9,0x06]
 ; X86-NEXT:    vpshldw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x70,0xd1,0x08]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc9,0x07]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
@@ -893,13 +888,12 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32,
 define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpshldw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xe1,0x06]
-; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x07]
+; X86-NEXT:    vpshldw $6, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xd9,0x06]
 ; X86-NEXT:    vpshldw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x70,0xd1,0x08]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc9,0x07]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
@@ -925,14 +919,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i3
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -958,14 +951,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32,
 define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdd $22, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshrdd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
@@ -991,14 +983,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32,
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdq $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshrdq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
@@ -1024,14 +1015,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32,
 define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdq $22, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshrdq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
@@ -1057,14 +1047,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32,
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xe1,0x06]
-; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x07]
+; X86-NEXT:    vpshrdw $6, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xd9,0x06]
 ; X86-NEXT:    vpshrdw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x72,0xd1,0x08]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc9,0x07]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
@@ -1090,13 +1079,12 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32,
 define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %ymm2, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpshrdw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xe1,0x06]
-; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x07]
+; X86-NEXT:    vpshrdw $6, %ymm1, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xd9,0x06]
 ; X86-NEXT:    vpshrdw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x72,0xd1,0x08]
-; X86-NEXT:    vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
+; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc9,0x07]
+; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
@@ -1122,14 +1110,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i3
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_128_2(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128_2:
@@ -1297,14 +1284,13 @@ declare <16 x i16> @llvm.x86.avx512.vpshld.w.256(<16 x i16>, <16 x i16>, i32)
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_128_2(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128_2:
@@ -1485,9 +1471,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrdv_d_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_d_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x73,0x00]
 ; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x73,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1525,9 +1511,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrdv_d_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_d_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x73,0x00]
 ; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x73,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1565,9 +1551,9 @@ define { <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrdv_q_256(<4 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x73,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1605,9 +1591,9 @@ define { <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrdv_q_128(<2 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x73,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1684,9 +1670,9 @@ define { <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrdv_w_128(<8 x i16
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_w_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvw (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x72,0x00]
 ; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x72,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1724,9 +1710,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshldv_d_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_d_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x71,0x00]
 ; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x71,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1764,9 +1750,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshldv_d_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_d_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x71,0x00]
 ; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x71,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1804,9 +1790,9 @@ define { <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshldv_q_256(<4 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x71,0x00]
 ; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x71,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1844,9 +1830,9 @@ define { <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshldv_q_128(<2 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x71,0x00]
 ; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x71,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1923,9 +1909,9 @@ define { <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshldv_w_128(<8 x i16
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_w_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvw (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x70,0x00]
 ; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x70,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
index 9223885730d04..bd218e54389e2 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
@@ -5,9 +5,9 @@
 define <8 x i16> @test_mask_expand_load_w_128(ptr %addr, <8 x i16> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -24,9 +24,9 @@ define <8 x i16> @test_mask_expand_load_w_128(ptr %addr, <8 x i16> %data, i8 %ma
 define <8 x i16> @test_maskz_expand_load_w_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -43,8 +43,8 @@ define <8 x i16> @test_maskz_expand_load_w_128(ptr %addr, i8 %mask) {
 define <8 x i16> @test_expand_load_w_128(ptr %addr, <8 x i16> %data) {
 ; X86-LABEL: test_expand_load_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x62,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -203,9 +203,9 @@ define <16 x i8> @test_maskz_expand_b_128(<16 x i8> %data, i16 %mask) {
 define void @test_mask_compress_store_w_128(ptr %addr, <8 x i16> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcompressw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -222,8 +222,8 @@ define void @test_mask_compress_store_w_128(ptr %addr, <8 x i16> %data, i8 %mask
 define void @test_compress_store_w_128(ptr %addr, <8 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -285,8 +285,8 @@ define <8 x i16> @test_compress_w_128(<8 x i16> %data) {
 define void @test_mask_compress_store_b_128(ptr %addr, <16 x i8> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -303,8 +303,8 @@ define void @test_mask_compress_store_b_128(ptr %addr, <16 x i8> %data, i16 %mas
 define void @test_compress_store_b_128(ptr %addr, <16 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x63,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -558,8 +558,8 @@ define <32 x i8> @test_maskz_expand_b_256(<32 x i8> %data, i32 %mask) {
 define void @test_mask_compress_store_w_256(ptr %addr, <16 x i16> %data, i16 %mask) {
 ; X86-LABEL: test_mask_compress_store_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -578,8 +578,8 @@ define void @test_mask_compress_store_w_256(ptr %addr, <16 x i16> %data, i16 %ma
 define void @test_compress_store_w_256(ptr %addr, <16 x i16> %data) {
 ; X86-LABEL: test_compress_store_w_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -641,8 +641,8 @@ define <16 x i16> @test_compress_w_256(<16 x i16> %data) {
 define void @test_mask_compress_store_b_256(ptr %addr, <32 x i8> %data, i32 %mask) {
 ; X86-LABEL: test_mask_compress_store_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -661,8 +661,8 @@ define void @test_mask_compress_store_b_256(ptr %addr, <32 x i8> %data, i32 %mas
 define void @test_compress_store_b_256(ptr %addr, <32 x i8> %data) {
 ; X86-LABEL: test_compress_store_b_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnord %k0, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x63,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -724,14 +724,13 @@ define <32 x i8> @test_compress_b_256(<32 x i8> %data) {
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
+; X86-NEXT:    vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xd9,0x16]
 ; X86-NEXT:    vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -893,14 +892,13 @@ define { <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w_256(<16 x i
 define { <4 x i32>, <4 x i32>,  <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
+; X86-NEXT:    vmovdqa %xmm2, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xda]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
+; X86-NEXT:    vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xd9,0x16]
 ; X86-NEXT:    vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
-; X86-NEXT:    vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
+; X86-NEXT:    vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
+; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -1063,9 +1061,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrdv_d_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_d_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x73,0x00]
 ; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x73,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1095,9 +1093,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrdv_d_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_d_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x73,0x00]
 ; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x73,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1129,9 +1127,9 @@ define { <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrdv_q_256(<4 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x73,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1163,9 +1161,9 @@ define { <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrdv_q_128(<2 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_q_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvq (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x73,0x00]
 ; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x73,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1228,9 +1226,9 @@ define { <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrdv_w_128(<8 x i16
 ; X86-LABEL: test_int_x86_avx512_mask_vpshrdv_w_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshrdvw (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x72,0x00]
 ; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x72,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1260,9 +1258,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshldv_d_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_d_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x71,0x00]
 ; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x71,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1292,9 +1290,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshldv_d_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_d_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x71,0x00]
 ; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x71,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1326,9 +1324,9 @@ define { <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshldv_q_256(<4 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x71,0x00]
 ; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x71,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -1360,9 +1358,9 @@ define { <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshldv_q_128(<2 x i64
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_q_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvq (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x71,0x00]
 ; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x71,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -1425,9 +1423,9 @@ define { <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshldv_w_128(<8 x i16
 ; X86-LABEL: test_int_x86_avx512_mask_vpshldv_w_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpshldvw (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x70,0x00]
 ; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x70,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c8504ed7151fb..b538761f330eb 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -7,9 +7,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x5c,0x24,0x04]
 ; X86-NEXT:    vpblendmd %xmm3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x64,0xcb]
 ; X86-NEXT:    vmovdqa32 %xmm3, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
@@ -38,9 +38,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x5c,0x24,0x04]
 ; X86-NEXT:    vpblendmq %xmm3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x64,0xcb]
 ; X86-NEXT:    vmovdqa64 %xmm3, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
@@ -69,9 +69,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
 define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x5c,0x24,0x04]
 ; X86-NEXT:    vpblendmd %ymm3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x64,0xcb]
 ; X86-NEXT:    vmovdqa32 %ymm3, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
@@ -99,9 +99,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
 define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x5c,0x24,0x04]
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x5c,0x24,0x04]
 ; X86-NEXT:    vpblendmq %ymm3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x64,0xcb]
 ; X86-NEXT:    vmovdqa64 %ymm3, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0xd3]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
@@ -1099,11 +1099,11 @@ declare void @llvm.x86.avx512.mask.store.pd.128(ptr, <2 x double>, i8)
 define void @test_int_x86_avx512_mask_store_pd_128(ptr %ptr1, ptr %ptr2, <2 x double> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_pd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovapd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x29,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovapd %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x29,0x01]
 ; X86-NEXT:    vmovapd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1123,11 +1123,11 @@ declare void @llvm.x86.avx512.mask.store.pd.256(ptr, <4 x double>, i8)
 define void @test_int_x86_avx512_mask_store_pd_256(ptr %ptr1, ptr %ptr2, <4 x double> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_pd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovapd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x29,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovapd %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x29,0x01]
 ; X86-NEXT:    vmovapd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x29,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1149,11 +1149,11 @@ declare void @llvm.x86.avx512.mask.storeu.pd.128(ptr, <2 x double>, i8)
 define void @test_int_x86_avx512_mask_storeu_pd_128(ptr %ptr1, ptr %ptr2, <2 x double> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_pd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovupd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x11,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovupd %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x11,0x01]
 ; X86-NEXT:    vmovupd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1173,11 +1173,11 @@ declare void @llvm.x86.avx512.mask.storeu.pd.256(ptr, <4 x double>, i8)
 define void @test_int_x86_avx512_mask_storeu_pd_256(ptr %ptr1, ptr %ptr2, <4 x double> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_pd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovupd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x11,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovupd %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x11,0x01]
 ; X86-NEXT:    vmovupd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x11,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1199,11 +1199,11 @@ declare void @llvm.x86.avx512.mask.store.ps.128(ptr, <4 x float>, i8)
 define void @test_int_x86_avx512_mask_store_ps_128(ptr %ptr1, ptr %ptr2, <4 x float> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_ps_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x29,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovaps %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x29,0x01]
 ; X86-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1223,11 +1223,11 @@ declare void @llvm.x86.avx512.mask.store.ps.256(ptr, <8 x float>, i8)
 define void @test_int_x86_avx512_mask_store_ps_256(ptr %ptr1, ptr %ptr2, <8 x float> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovaps %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x01]
 ; X86-NEXT:    vmovaps %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1249,11 +1249,11 @@ declare void @llvm.x86.avx512.mask.storeu.ps.128(ptr, <4 x float>, i8)
 define void @test_int_x86_avx512_mask_storeu_ps_128(ptr %ptr1, ptr %ptr2, <4 x float> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_ps_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovups %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x11,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovups %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x11,0x01]
 ; X86-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1273,11 +1273,11 @@ declare void @llvm.x86.avx512.mask.storeu.ps.256(ptr, <8 x float>, i8)
 define void @test_int_x86_avx512_mask_storeu_ps_256(ptr %ptr1, ptr %ptr2, <8 x float> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovups %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovups %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x11,0x01]
 ; X86-NEXT:    vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1299,11 +1299,11 @@ declare void @llvm.x86.avx512.mask.storeu.q.128(ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_storeu_q_128(ptr %ptr1, ptr %ptr2, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x01]
 ; X86-NEXT:    vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1323,11 +1323,11 @@ declare void @llvm.x86.avx512.mask.storeu.q.256(ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_storeu_q_256(ptr %ptr1, ptr %ptr2, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu64 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x01]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1349,11 +1349,11 @@ declare void @llvm.x86.avx512.mask.storeu.d.128(ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_storeu_d_128(ptr %ptr1, ptr %ptr2, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu32 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu32 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x01]
 ; X86-NEXT:    vmovdqu %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1373,11 +1373,11 @@ declare void @llvm.x86.avx512.mask.storeu.d.256(ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_storeu_d_256(ptr %ptr1, ptr %ptr2, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_storeu_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu32 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu32 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x01]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1399,11 +1399,11 @@ declare void @llvm.x86.avx512.mask.store.q.128(ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_store_q_128(ptr %ptr1, ptr %ptr2, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqa64 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x01]
 ; X86-NEXT:    vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1423,11 +1423,11 @@ declare void @llvm.x86.avx512.mask.store.q.256(ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_store_q_256(ptr %ptr1, ptr %ptr2, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqa64 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x01]
 ; X86-NEXT:    vmovdqa %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1449,11 +1449,11 @@ declare void @llvm.x86.avx512.mask.store.d.128(ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_store_d_128(ptr %ptr1, ptr %ptr2, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa32 %xmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqa32 %xmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x01]
 ; X86-NEXT:    vmovdqa %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1473,11 +1473,11 @@ declare void @llvm.x86.avx512.mask.store.d.256(ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_store_d_256(ptr %ptr1, ptr %ptr2, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_store_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa32 %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x00]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqa32 %ymm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x01]
 ; X86-NEXT:    vmovdqa %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1497,10 +1497,10 @@ define void @test_int_x86_avx512_mask_store_d_256(ptr %ptr1, ptr %ptr2, <8 x i32>
 define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_aligned_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovaps (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0x00]
 ; X86-NEXT:    vmovaps (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x08]
 ; X86-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
@@ -1526,10 +1526,10 @@ declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(ptr, <8 x float>, i8)
 define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovups (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovups (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x10,0x00]
 ; X86-NEXT:    vmovups (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x08]
 ; X86-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
@@ -1731,13 +1731,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(ptr, <4 x i32>, i8)
 define <4 x i32> @test_mask_load_unaligned_d_128(ptr %ptr, ptr %ptr2, <4 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu32 (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x00]
-; X86-NEXT:    vmovdqu32 (%ecx), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x09]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x0c]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqu32 (%ecx), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x01]
+; X86-NEXT:    vmovdqu32 (%eax), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x08]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1761,13 +1761,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(ptr, <8 x i32>, i8)
 define <8 x i32> @test_mask_load_unaligned_d_256(ptr %ptr, ptr %ptr2, <8 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_d_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu32 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x00]
-; X86-NEXT:    vmovdqu32 (%ecx), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x09]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqu32 (%ecx), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x01]
+; X86-NEXT:    vmovdqu32 (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x08]
 ; X86-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1791,13 +1791,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(ptr, <2 x i64>, i8)
 define <2 x i64> @test_mask_load_unaligned_q_128(ptr %ptr, ptr %ptr2, <2 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x00]
-; X86-NEXT:    vmovdqu64 (%ecx), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x09]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x0c]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqu64 (%ecx), %xmm0 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x01]
+; X86-NEXT:    vmovdqu64 (%eax), %xmm1 {%k1} {z} # encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x08]
 ; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1821,13 +1821,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(ptr, <4 x i64>, i8)
 define <4 x i64> @test_mask_load_unaligned_q_256(ptr %ptr, ptr %ptr2, <4 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_load_unaligned_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-NEXT:    vmovdqu (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x01]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx # encoding: [0x0f,0xb6,0x54,0x24,0x0c]
-; X86-NEXT:    kmovw %edx, %k1 # encoding: [0xc5,0xf8,0x92,0xca]
-; X86-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x00]
-; X86-NEXT:    vmovdqu64 (%ecx), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x09]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqu (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x0c]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqu64 (%ecx), %ymm0 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x01]
+; X86-NEXT:    vmovdqu64 (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x08]
 ; X86-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1880,10 +1880,10 @@ declare <8 x i32> @llvm.x86.avx512.mask.load.d.256(ptr, <8 x i32>, i8)
 define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, ptr %ptr, i8 %mask) {
 ; X86-LABEL: test_mask_load_aligned_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovdqa (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x00]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vmovdqa32 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x00]
 ; X86-NEXT:    vmovdqa32 (%eax), %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x08]
 ; X86-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
@@ -2940,9 +2940,9 @@ define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2961,9 +2961,9 @@ define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32
 define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2998,9 +2998,9 @@ define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3021,9 +3021,9 @@ define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i3
 define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3105,9 +3105,9 @@ define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3126,9 +3126,9 @@ define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32
 define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3163,9 +3163,9 @@ define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3186,9 +3186,9 @@ define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i3
 define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_and_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3270,9 +3270,9 @@ define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3291,9 +3291,9 @@ define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32>
 define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3328,9 +3328,9 @@ define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3351,9 +3351,9 @@ define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32
 define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3435,9 +3435,9 @@ define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3456,9 +3456,9 @@ define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32>
 define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3493,9 +3493,9 @@ define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3516,9 +3516,9 @@ define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32
 define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_or_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpord (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3600,9 +3600,9 @@ define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xef,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3621,9 +3621,9 @@ define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32
 define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xef,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3658,9 +3658,9 @@ define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xef,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3681,9 +3681,9 @@ define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i3
 define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xef,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3765,9 +3765,9 @@ define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xef,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3786,9 +3786,9 @@ define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32
 define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3823,9 +3823,9 @@ define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xef,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3846,9 +3846,9 @@ define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i3
 define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_xor_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpxord (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3930,9 +3930,9 @@ define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3951,9 +3951,9 @@ define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x
 define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3988,9 +3988,9 @@ define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4011,9 +4011,9 @@ define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x
 define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4095,9 +4095,9 @@ define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4116,9 +4116,9 @@ define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4153,9 +4153,9 @@ define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4176,9 +4176,9 @@ define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x
 define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4260,9 +4260,9 @@ define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, ptr %ptr_b) {
 define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, ptr %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4281,9 +4281,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, ptr %ptr_b, <2 x
 define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4318,9 +4318,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, ptr %ptr_b) {
 define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, ptr %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4341,9 +4341,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, ptr %ptr_b, <2 x
 define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4425,9 +4425,9 @@ define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, ptr %ptr_b) {
 define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, ptr %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4446,9 +4446,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, ptr %ptr_b, <4 x
 define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4483,9 +4483,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, ptr %ptr_b) {
 define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, ptr %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4506,9 +4506,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, ptr %ptr_b, <4 x
 define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpandnq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4590,9 +4590,9 @@ define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4611,9 +4611,9 @@ define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32
 define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4648,9 +4648,9 @@ define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4671,9 +4671,9 @@ define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i3
 define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4755,9 +4755,9 @@ define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4776,9 +4776,9 @@ define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32
 define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4813,9 +4813,9 @@ define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, ptr %ptr_b) {
 define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4836,9 +4836,9 @@ define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, ptr %ptr_b, <4 x i3
 define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4920,9 +4920,9 @@ define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4941,9 +4941,9 @@ define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32
 define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4978,9 +4978,9 @@ define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5001,9 +5001,9 @@ define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i3
 define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_sub_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpsubd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -5085,9 +5085,9 @@ define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5106,9 +5106,9 @@ define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32
 define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -5143,9 +5143,9 @@ define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, ptr %ptr_b) {
 define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i32> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5166,9 +5166,9 @@ define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, ptr %ptr_b, <8 x i3
 define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_add_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpaddd (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -6855,8 +6855,8 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_psrl_qi_128
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpsrlq $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0xe5,0x09,0x73,0xd0,0x03]
-; X86-NEXT:    vpsrlq $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xd0,0x04]
 ; X86-NEXT:    vpsrlq $5, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xed,0x89,0x73,0xd0,0x05]
+; X86-NEXT:    vpsrlq $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -6887,8 +6887,8 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_psrl_qi_256
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpsrlq $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0xe5,0x29,0x73,0xd0,0x03]
-; X86-NEXT:    vpsrlq $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x73,0xd0,0x04]
 ; X86-NEXT:    vpsrlq $5, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xed,0xa9,0x73,0xd0,0x05]
+; X86-NEXT:    vpsrlq $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x73,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -6919,8 +6919,8 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_psrl_di_128
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpsrld $3, %xmm0, %xmm3 {%k1} # encoding: [0x62,0xf1,0x65,0x09,0x72,0xd0,0x03]
-; X86-NEXT:    vpsrld $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x04]
 ; X86-NEXT:    vpsrld $5, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0x89,0x72,0xd0,0x05]
+; X86-NEXT:    vpsrld $4, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -6951,8 +6951,8 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_psrl_di_256
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpsrld $3, %ymm0, %ymm3 {%k1} # encoding: [0x62,0xf1,0x65,0x29,0x72,0xd0,0x03]
-; X86-NEXT:    vpsrld $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x72,0xd0,0x04]
 ; X86-NEXT:    vpsrld $5, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xd0,0x05]
+; X86-NEXT:    vpsrld $4, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x72,0xd0,0x04]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -9492,31 +9492,57 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %
 declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x66,0xc0]
-; CHECK-NEXT:    vpcmpled %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltd %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x05]
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k4 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xe1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_d_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x66,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpled %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_d_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x66,0xc0]
+; X64-NEXT:    vpcmpled %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x02]
+; X64-NEXT:    vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltd %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x05]
+; X64-NEXT:    vpcmpgtd %ymm1, %ymm0, %k4 # encoding: [0x62,0xf1,0x7d,0x28,0x66,0xe1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
@@ -9541,25 +9567,25 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x66,0xc0]
-; X86-NEXT:    vpcmpled %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x02]
-; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltd %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x05]
-; X86-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc9]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx # encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmpled %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -9611,31 +9637,57 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    vpcmpltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleud %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x02]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04]
-; CHECK-NEXT:    vpcmpnltud %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x05]
-; CHECK-NEXT:    vpcmpnleud %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x06]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
-; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_d_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_d_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpcmpltud %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x01]
+; X64-NEXT:    vpcmpleud %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x02]
+; X64-NEXT:    vpcmpneqd %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x04]
+; X64-NEXT:    vpcmpnltud %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x05]
+; X64-NEXT:    vpcmpnleud %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x06]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; X64-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x01]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $255, %eax # encoding: [0xb8,0xff,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
@@ -9660,25 +9712,25 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
 ; X86-NEXT:    vpcmpltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x01]
-; X86-NEXT:    vpcmpleud %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x02]
-; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x04]
-; X86-NEXT:    vpcmpnltud %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x05]
-; X86-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x06]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
 ; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc2,0x01]
-; X86-NEXT:    kmovw %k2, %ecx # encoding: [0xc5,0xf8,0x93,0xca]
-; X86-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x02]
-; X86-NEXT:    kmovw %k3, %ecx # encoding: [0xc5,0xf8,0x93,0xcb]
-; X86-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x04]
-; X86-NEXT:    kmovw %k4, %ecx # encoding: [0xc5,0xf8,0x93,0xcc]
-; X86-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x05]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x06]
+; X86-NEXT:    movzbl %dl, %edx # encoding: [0x0f,0xb6,0xd2]
+; X86-NEXT:    vmovd %edx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd2]
+; X86-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x01]
+; X86-NEXT:    vpcmpleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $2, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x02]
+; X86-NEXT:    vpcmpneqd %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $4, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x04]
+; X86-NEXT:    vpcmpnltud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $5, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd1,0x05]
+; X86-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc1,0x06]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -9730,30 +9782,55 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
-; CHECK-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1 # encoding: [0x62,0xf2,0xf5,0x28,0x37,0xc8]
-; CHECK-NEXT:    vpcmpleq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xe9]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_q_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0 # encoding: [0x62,0xf2,0xf5,0x28,0x37,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_q_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; X64-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1 # encoding: [0x62,0xf2,0xf5,0x28,0x37,0xc8]
+; X64-NEXT:    vpcmpleq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf2,0xfd,0x28,0x37,0xe9]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
@@ -9778,27 +9855,27 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpgtq %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x37,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
-; X86-NEXT:    vpcmpgtq %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x37,0xd0]
-; X86-NEXT:    vpcmpleq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xf1]
-; X86-NEXT:    kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
-; X86-NEXT:    kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
+; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -9852,30 +9929,55 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_256:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
-; CHECK-NEXT:    vpcmpltuq %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleuq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x06]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_q_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_q_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; X64-NEXT:    vpcmpltuq %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
+; X64-NEXT:    vpcmpleuq %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x06]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
@@ -9900,27 +10002,27 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpltuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
-; X86-NEXT:    vpcmpltuq %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01]
-; X86-NEXT:    vpcmpleuq %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x06]
-; X86-NEXT:    kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
-; X86-NEXT:    kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
+; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -9974,29 +10076,53 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
-; CHECK-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x66,0xc8]
-; CHECK-NEXT:    vpcmpled %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltd %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xe9]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_d_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x66,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpled %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_d_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; X64-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x66,0xc8]
+; X64-NEXT:    vpcmpled %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltd %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtd %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x66,0xe9]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
@@ -10021,27 +10147,27 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpgtd %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x66,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
-; X86-NEXT:    vpcmpgtd %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x66,0xd0]
-; X86-NEXT:    vpcmpled %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltd %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtd %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xf1]
-; X86-NEXT:    kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
-; X86-NEXT:    kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpled %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
+; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -10093,29 +10219,53 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
-; CHECK-NEXT:    vpcmpltud %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleud %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltud %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleud %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x06]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_d_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_d_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; X64-NEXT:    vpcmpltud %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
+; X64-NEXT:    vpcmpleud %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltud %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleud %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x06]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $15, %eax # encoding: [0xb8,0x0f,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
@@ -10140,27 +10290,27 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
-; X86-NEXT:    vpcmpltud %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01]
-; X86-NEXT:    vpcmpleud %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltud %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleud %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x06]
-; X86-NEXT:    kshiftlw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
-; X86-NEXT:    kshiftrw $12, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqd %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
+; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -10212,29 +10362,53 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
-; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x08,0x37,0xc8]
-; CHECK-NEXT:    vpcmpleq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x05]
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xe9]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_cmp_q_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0 # encoding: [0x62,0xf2,0xf5,0x08,0x37,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_q_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; X64-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1 # encoding: [0x62,0xf2,0xf5,0x08,0x37,0xc8]
+; X64-NEXT:    vpcmpleq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
+; X64-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x05]
+; X64-NEXT:    vpcmpgtq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf2,0xfd,0x08,0x37,0xe9]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
@@ -10259,27 +10433,27 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x37,0xc0]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
-; X86-NEXT:    vpcmpgtq %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x37,0xd0]
-; X86-NEXT:    vpcmpleq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02]
-; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x05]
-; X86-NEXT:    vpcmpgtq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xf1]
-; X86-NEXT:    kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e]
-; X86-NEXT:    kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
+; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -10331,29 +10505,53 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
 
 define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
-; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
-; CHECK-NEXT:    vpcmpleuq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x06]
-; CHECK-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; CHECK-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
-; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
-; CHECK-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-LABEL: test_ucmp_q_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
+; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_q_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; X64-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
+; X64-NEXT:    vpcmpleuq %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
+; X64-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04]
+; X64-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x05]
+; X64-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x06]
+; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X64-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X64-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X64-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X64-NEXT:    movl $3, %eax # encoding: [0xb8,0x03,0x00,0x00,0x00]
+; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
@@ -10378,27 +10576,27 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x01]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
-; X86-NEXT:    vpcmpltuq %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01]
-; X86-NEXT:    vpcmpleuq %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02]
-; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x04]
-; X86-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x05]
-; X86-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k6 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x06]
-; X86-NEXT:    kshiftlw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e]
-; X86-NEXT:    kshiftrw $14, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e]
-; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
 ; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
-; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
-; X86-NEXT:    kmovw %k3, %eax # encoding: [0xc5,0xf8,0x93,0xc3]
-; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; X86-NEXT:    kmovw %k4, %eax # encoding: [0xc5,0xf8,0x93,0xc4]
-; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; X86-NEXT:    kmovw %k5, %eax # encoding: [0xc5,0xf8,0x93,0xc5]
-; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
-; X86-NEXT:    kmovw %k6, %eax # encoding: [0xc5,0xf8,0x93,0xc6]
-; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1]
+; X86-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x01]
+; X86-NEXT:    vpcmpleuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x02]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x02]
+; X86-NEXT:    vpcmpneqq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x04]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x04]
+; X86-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x05]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x20,0xd0,0x05]
+; X86-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x06]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm0 # encoding: [0xc4,0xe3,0x69,0x20,0xc0,0x06]
+; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
+; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -10504,9 +10702,9 @@ define <8 x float>@test_int_x86_avx512_maskz_broadcastf32x4_256(<4 x float> %x0,
 define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(ptr %x0ptr, <8 x float> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vbroadcastf32x4 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x1a,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10577,9 +10775,9 @@ define <8 x i32>@test_int_x86_avx512_maskz_broadcasti32x4_256(<4 x i32> %x0, i8
 define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(ptr %x0ptr, <8 x i32> %x2, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vbroadcasti32x4 (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x5a,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10724,8 +10922,8 @@ define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10783,8 +10981,8 @@ define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10814,8 +11012,8 @@ define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -10847,8 +11045,8 @@ define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10906,8 +11104,8 @@ define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -10937,8 +11135,8 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
 ; X86-NEXT:    addb %cl, %al # encoding: [0x00,0xc8]
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -11075,9 +11273,9 @@ define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, ptr %ptr_b) {
 define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x28,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11096,9 +11294,9 @@ define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x
 define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11134,9 +11332,9 @@ define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, ptr %ptr_b) {
 define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x28,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11158,9 +11356,9 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x
 define < 2 x i64> @test_mask_mul_epi32_rmbk_128_buildvector(< 4 x i32> %a, ptr %ptr_b, < 2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk_128_buildvector:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x28,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11182,9 +11380,9 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128_buildvector(< 4 x i32> %a, ptr %
 define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11204,9 +11402,9 @@ define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, ptr %ptr_b, i8 %
 define < 2 x i64> @test_mask_mul_epi32_rmbkz_128_buildvector(< 4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz_128_buildvector:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11289,9 +11487,9 @@ define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, ptr %ptr_b) {
 define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x28,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11310,9 +11508,9 @@ define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x
 define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11348,9 +11546,9 @@ define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, ptr %ptr_b) {
 define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x28,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11372,9 +11570,9 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x
 define < 4 x i64> @test_mask_mul_epi32_rmbk_256_buildvector(< 8 x i32> %a, ptr %ptr_b, < 4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbk_256_buildvector:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x28,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11398,9 +11596,9 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256_buildvector(< 8 x i32> %a, ptr %
 define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11420,9 +11618,9 @@ define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, ptr %ptr_b, i8 %
 define < 4 x i64> @test_mask_mul_epi32_rmbkz_256_buildvector(< 8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epi32_rmbkz_256_buildvector:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuldq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11507,9 +11705,9 @@ define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, ptr %ptr_b) {
 define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmk_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x08]
 ; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11528,9 +11726,9 @@ define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, ptr %ptr_b, < 2 x
 define < 2 x i64> @test_mask_mul_epu32_rmkz_128(< 4 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmkz_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -11687,9 +11885,9 @@ define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, ptr %ptr_b) {
 define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x i64> %passThru, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmk_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x08]
 ; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -11708,9 +11906,9 @@ define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, ptr %ptr_b, < 4 x
 define < 4 x i64> @test_mask_mul_epu32_rmkz_256(< 8 x i32> %a, ptr %ptr_b, i8 %mask) {
 ; X86-LABEL: test_mask_mul_epu32_rmkz_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmuludq (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13091,9 +13289,9 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x
 define void @test_mask_compress_store_pd_128(ptr %addr, <2 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_pd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vcompresspd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x8a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13111,8 +13309,8 @@ declare void @llvm.x86.avx512.mask.compress.store.pd.128(ptr %addr, <2 x double>
 define void @test_compress_store_pd_128(ptr %addr, <2 x double> %data) {
 ; X86-LABEL: test_compress_store_pd_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompresspd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x8a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13128,9 +13326,9 @@ define void @test_compress_store_pd_128(ptr %addr, <2 x double> %data) {
 define void @test_mask_compress_store_ps_128(ptr %addr, <4 x float> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_ps_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vcompressps %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13148,8 +13346,8 @@ declare void @llvm.x86.avx512.mask.compress.store.ps.128(ptr %addr, <4 x float>
 define void @test_compress_store_ps_128(ptr %addr, <4 x float> %data) {
 ; X86-LABEL: test_compress_store_ps_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompressps %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13165,9 +13363,9 @@ define void @test_compress_store_ps_128(ptr %addr, <4 x float> %data) {
 define void @test_mask_compress_store_q_128(ptr %addr, <2 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpcompressq %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x8b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13185,8 +13383,8 @@ declare void @llvm.x86.avx512.mask.compress.store.q.128(ptr %addr, <2 x i64> %da
 define void @test_compress_store_q_128(ptr %addr, <2 x i64> %data) {
 ; X86-LABEL: test_compress_store_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressq %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x8b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13202,9 +13400,9 @@ define void @test_compress_store_q_128(ptr %addr, <2 x i64> %data) {
 define void @test_mask_compress_store_d_128(ptr %addr, <4 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpcompressd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x8b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13222,8 +13420,8 @@ declare void @llvm.x86.avx512.mask.compress.store.d.128(ptr %addr, <4 x i32> %da
 define void @test_compress_store_d_128(ptr %addr, <4 x i32> %data) {
 ; X86-LABEL: test_compress_store_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x8b,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13239,9 +13437,9 @@ define void @test_compress_store_d_128(ptr %addr, <4 x i32> %data) {
 define <2 x double> @test_mask_expand_load_pd_128(ptr %addr, <2 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_pd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13257,9 +13455,9 @@ define <2 x double> @test_mask_expand_load_pd_128(ptr %addr, <2 x double> %data,
 define <2 x double> @test_maskz_expand_load_pd_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_pd_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13277,8 +13475,8 @@ declare <2 x double> @llvm.x86.avx512.mask.expand.load.pd.128(ptr %addr, <2 x do
 define <2 x double> @test_expand_load_pd_128(ptr %addr, <2 x double> %data) {
 ; X86-LABEL: test_expand_load_pd_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vexpandpd (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13294,9 +13492,9 @@ define <2 x double> @test_expand_load_pd_128(ptr %addr, <2 x double> %data) {
 define <4 x float> @test_mask_expand_load_ps_128(ptr %addr, <4 x float> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_ps_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13312,9 +13510,9 @@ define <4 x float> @test_mask_expand_load_ps_128(ptr %addr, <4 x float> %data, i
 define <4 x float> @test_maskz_expand_load_ps_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_ps_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13332,8 +13530,8 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(ptr %addr, <4 x flo
 define <4 x float> @test_expand_load_ps_128(ptr %addr, <4 x float> %data) {
 ; X86-LABEL: test_expand_load_ps_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vexpandps (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13349,9 +13547,9 @@ define <4 x float> @test_expand_load_ps_128(ptr %addr, <4 x float> %data) {
 define <2 x i64> @test_mask_expand_load_q_128(ptr %addr, <2 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13367,9 +13565,9 @@ define <2 x i64> @test_mask_expand_load_q_128(ptr %addr, <2 x i64> %data, i8 %ma
 define <2 x i64> @test_maskz_expand_load_q_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_q_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13387,8 +13585,8 @@ declare <2 x i64> @llvm.x86.avx512.mask.expand.load.q.128(ptr %addr, <2 x i64> %
 define <2 x i64> @test_expand_load_q_128(ptr %addr, <2 x i64> %data) {
 ; X86-LABEL: test_expand_load_q_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13404,9 +13602,9 @@ define <2 x i64> @test_expand_load_q_128(ptr %addr, <2 x i64> %data) {
 define <4 x i32> @test_mask_expand_load_d_128(ptr %addr, <4 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13422,9 +13620,9 @@ define <4 x i32> @test_mask_expand_load_d_128(ptr %addr, <4 x i32> %data, i8 %ma
 define <4 x i32> @test_maskz_expand_load_d_128(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_d_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13442,8 +13640,8 @@ declare <4 x i32> @llvm.x86.avx512.mask.expand.load.d.128(ptr %addr, <4 x i32> %
 define <4 x i32> @test_expand_load_d_128(ptr %addr, <4 x i32> %data) {
 ; X86-LABEL: test_expand_load_d_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandd (%eax), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13459,9 +13657,9 @@ define <4 x i32> @test_expand_load_d_128(ptr %addr, <4 x i32> %data) {
 define void @test_mask_compress_store_pd_256(ptr %addr, <4 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_pd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vcompresspd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13481,8 +13679,8 @@ declare void @llvm.x86.avx512.mask.compress.store.pd.256(ptr %addr, <4 x double>
 define void @test_compress_store_pd_256(ptr %addr, <4 x double> %data) {
 ; X86-LABEL: test_compress_store_pd_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompresspd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13500,9 +13698,9 @@ define void @test_compress_store_pd_256(ptr %addr, <4 x double> %data) {
 define void @test_mask_compress_store_ps_256(ptr %addr, <8 x float> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vcompressps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x8a,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13522,8 +13720,8 @@ declare void @llvm.x86.avx512.mask.compress.store.ps.256(ptr %addr, <8 x float>
 define void @test_compress_store_ps_256(ptr %addr, <8 x float> %data) {
 ; X86-LABEL: test_compress_store_ps_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vcompressps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x8a,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13541,9 +13739,9 @@ define void @test_compress_store_ps_256(ptr %addr, <8 x float> %data) {
 define void @test_mask_compress_store_q_256(ptr %addr, <4 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpcompressq %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x8b,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13563,8 +13761,8 @@ declare void @llvm.x86.avx512.mask.compress.store.q.256(ptr %addr, <4 x i64> %da
 define void @test_compress_store_q_256(ptr %addr, <4 x i64> %data) {
 ; X86-LABEL: test_compress_store_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressq %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x8b,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13582,9 +13780,9 @@ define void @test_compress_store_q_256(ptr %addr, <4 x i64> %data) {
 define void @test_mask_compress_store_d_256(ptr %addr, <8 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_compress_store_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpcompressd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x8b,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13604,8 +13802,8 @@ declare void @llvm.x86.avx512.mask.compress.store.d.256(ptr %addr, <8 x i32> %da
 define void @test_compress_store_d_256(ptr %addr, <8 x i32> %data) {
 ; X86-LABEL: test_compress_store_d_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpcompressd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x8b,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -13623,9 +13821,9 @@ define void @test_compress_store_d_256(ptr %addr, <8 x i32> %data) {
 define <4 x double> @test_mask_expand_load_pd_256(ptr %addr, <4 x double> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_pd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13641,9 +13839,9 @@ define <4 x double> @test_mask_expand_load_pd_256(ptr %addr, <4 x double> %data,
 define <4 x double> @test_maskz_expand_load_pd_256(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_pd_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13661,8 +13859,8 @@ declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(ptr %addr, <4 x do
 define <4 x double> @test_expand_load_pd_256(ptr %addr, <4 x double> %data) {
 ; X86-LABEL: test_expand_load_pd_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vexpandpd (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13678,9 +13876,9 @@ define <4 x double> @test_expand_load_pd_256(ptr %addr, <4 x double> %data) {
 define <8 x float> @test_mask_expand_load_ps_256(ptr %addr, <8 x float> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13696,9 +13894,9 @@ define <8 x float> @test_mask_expand_load_ps_256(ptr %addr, <8 x float> %data, i
 define <8 x float> @test_maskz_expand_load_ps_256(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_ps_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13716,8 +13914,8 @@ declare <8 x float> @llvm.x86.avx512.mask.expand.load.ps.256(ptr %addr, <8 x flo
 define <8 x float> @test_expand_load_ps_256(ptr %addr, <8 x float> %data) {
 ; X86-LABEL: test_expand_load_ps_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vexpandps (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x88,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13733,9 +13931,9 @@ define <8 x float> @test_expand_load_ps_256(ptr %addr, <8 x float> %data) {
 define <4 x i64> @test_mask_expand_load_q_256(ptr %addr, <4 x i64> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13751,9 +13949,9 @@ define <4 x i64> @test_mask_expand_load_q_256(ptr %addr, <4 x i64> %data, i8 %ma
 define <4 x i64> @test_maskz_expand_load_q_256(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_q_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13771,8 +13969,8 @@ declare <4 x i64> @llvm.x86.avx512.mask.expand.load.q.256(ptr %addr, <4 x i64> %
 define <4 x i64> @test_expand_load_q_256(ptr %addr, <4 x i64> %data) {
 ; X86-LABEL: test_expand_load_q_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandq (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13788,9 +13986,9 @@ define <4 x i64> @test_expand_load_q_256(ptr %addr, <4 x i64> %data) {
 define <8 x i32> @test_mask_expand_load_d_256(ptr %addr, <8 x i32> %data, i8 %mask) {
 ; X86-LABEL: test_mask_expand_load_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13806,9 +14004,9 @@ define <8 x i32> @test_mask_expand_load_d_256(ptr %addr, <8 x i32> %data, i8 %ma
 define <8 x i32> @test_maskz_expand_load_d_256(ptr %addr, i8 %mask) {
 ; X86-LABEL: test_maskz_expand_load_d_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -13826,8 +14024,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(ptr %addr, <8 x i32> %
 define <8 x i32> @test_expand_load_d_256(ptr %addr, <8 x i32> %data) {
 ; X86-LABEL: test_expand_load_d_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kxnorw %k0, %k0, %k1 # encoding: [0xc5,0xfc,0x46,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpexpandd (%eax), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x89,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -16359,9 +16557,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
 define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -16380,9 +16578,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1,
 define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmka:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -16437,9 +16635,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a
 define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmb:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax){1to4}, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x19,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -16462,9 +16660,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
 define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmba:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax){1to4}, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x19,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -16531,9 +16729,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
 define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_pd_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213pd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -16570,9 +16768,9 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double>
 define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd256_pd_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213pd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = (ymm1 * ymm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -17404,12 +17602,12 @@ define void @test_cmp_128(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x f
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
 ; X86-NEXT:    subl $16, %esp # encoding: [0x83,0xec,0x10]
-; X86-NEXT:    movl 24(%ebp), %eax # encoding: [0x8b,0x45,0x18]
 ; X86-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
 ; X86-NEXT:    vcmpltps 8(%ebp), %xmm2, %k1 # encoding: [0x62,0xf1,0x6c,0x08,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
 ; X86-NEXT:    kshiftlw $4, %k1, %k1 # encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x04]
 ; X86-NEXT:    korw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x45,0xc9]
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    movl 24(%ebp), %eax # encoding: [0x8b,0x45,0x18]
 ; X86-NEXT:    vmovaps %ymm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x29,0x00]
 ; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
 ; X86-NEXT:    popl %ebp # encoding: [0x5d]
@@ -17445,11 +17643,11 @@ define void @test_cmp_256(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x f
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
 ; X86-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
-; X86-NEXT:    movl 40(%ebp), %eax # encoding: [0x8b,0x45,0x28]
 ; X86-NEXT:    vcmpltps %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x01]
 ; X86-NEXT:    vcmpltps 8(%ebp), %ymm2, %k1 # encoding: [0x62,0xf1,0x6c,0x28,0xc2,0x8d,0x08,0x00,0x00,0x00,0x01]
 ; X86-NEXT:    kunpckbw %k0, %k1, %k1 # encoding: [0xc5,0xf5,0x4b,0xc8]
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
+; X86-NEXT:    movl 40(%ebp), %eax # encoding: [0x8b,0x45,0x28]
 ; X86-NEXT:    vmovaps %zmm0, (%eax) {%k1} # encoding: [0x62,0xf1,0x7c,0x49,0x29,0x00]
 ; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
 ; X86-NEXT:    popl %ebp # encoding: [0x5d]
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 7fe32c293420d..d4f454273e070 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1663,8 +1663,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
 ; X86-NEXT:    vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; X86-NEXT:    vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1692,10 +1692,10 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x32,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1717,8 +1717,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
 ; X86-NEXT:    vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; X86-NEXT:    vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1746,10 +1746,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x22,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1771,8 +1771,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
 ; X86-NEXT:    vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; X86-NEXT:    vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1800,10 +1800,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x12,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1825,8 +1825,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
 ; X86-NEXT:    vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; X86-NEXT:    vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1856,10 +1856,10 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x32,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1883,8 +1883,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
 ; X86-NEXT:    vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; X86-NEXT:    vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1914,10 +1914,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x22,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1941,8 +1941,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
 ; X86-NEXT:    vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; X86-NEXT:    vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -1972,10 +1972,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x12,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -1999,8 +1999,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
 ; X86-NEXT:    vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; X86-NEXT:    vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2028,10 +2028,10 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x34,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2053,8 +2053,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
 ; X86-NEXT:    vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; X86-NEXT:    vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2082,10 +2082,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x24,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2107,8 +2107,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
 ; X86-NEXT:    vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; X86-NEXT:    vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2136,10 +2136,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x14,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2161,8 +2161,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
 ; X86-NEXT:    vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; X86-NEXT:    vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2192,10 +2192,10 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x34,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2219,8 +2219,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
 ; X86-NEXT:    vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; X86-NEXT:    vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2250,10 +2250,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x24,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2277,8 +2277,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
 ; X86-NEXT:    vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; X86-NEXT:    vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -2308,10 +2308,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x14,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2335,8 +2335,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
 ; X86-NEXT:    vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; X86-NEXT:    vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
 ; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
@@ -2364,10 +2364,10 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqd %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x35,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2389,8 +2389,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
 ; X86-NEXT:    vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; X86-NEXT:    vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
 ; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
@@ -2418,10 +2418,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqd %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x25,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2443,8 +2443,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
 ; X86-NEXT:    vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; X86-NEXT:    vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
 ; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
@@ -2472,10 +2472,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqd %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x15,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqd %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2551,10 +2551,10 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmov_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovqd %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x35,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovqd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2628,10 +2628,10 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsqd %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x25,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsqd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x25,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2705,10 +2705,10 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64>, i8)
 define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusqd %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x15,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusqd %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x15,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2732,8 +2732,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
 ; X86-NEXT:    vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; X86-NEXT:    vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -2761,10 +2761,10 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmov_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovdb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x31,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovdb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2786,8 +2786,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
 ; X86-NEXT:    vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; X86-NEXT:    vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -2815,10 +2815,10 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovs_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsdb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x21,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsdb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2840,8 +2840,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
 ; X86-NEXT:    vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; X86-NEXT:    vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -2869,10 +2869,10 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovus_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusdb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x11,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusdb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2894,8 +2894,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
 ; X86-NEXT:    vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; X86-NEXT:    vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -2925,10 +2925,10 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmov_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovdb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x31,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovdb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2952,8 +2952,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
 ; X86-NEXT:    vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; X86-NEXT:    vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -2983,10 +2983,10 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovs_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsdb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x21,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsdb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3010,8 +3010,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
 ; X86-NEXT:    vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; X86-NEXT:    vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
 ; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -3041,10 +3041,10 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovus_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusdb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x11,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusdb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3068,8 +3068,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
 ; X86-NEXT:    vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; X86-NEXT:    vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3097,10 +3097,10 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmov_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovdw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3122,8 +3122,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
 ; X86-NEXT:    vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; X86-NEXT:    vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3151,10 +3151,10 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x23,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsdw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3176,8 +3176,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
 ; X86-NEXT:    vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; X86-NEXT:    vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3205,10 +3205,10 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x13,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusdw %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0x00]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3230,8 +3230,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
 ; X86-NEXT:    vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; X86-NEXT:    vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3261,10 +3261,10 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmov_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovdw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x33,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovdw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3288,8 +3288,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
 ; X86-NEXT:    vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; X86-NEXT:    vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3319,10 +3319,10 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovsdw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x23,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovsdw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -3346,8 +3346,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
 ; X86-NEXT:    vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; X86-NEXT:    vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
 ; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3377,10 +3377,10 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32>, i8)
 define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vpmovusdw %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x13,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpmovusdw %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0x00]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4046,10 +4046,10 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
-; X86-NEXT:    vgetmantpd $12, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0c]
-; X86-NEXT:    vgetmantpd $13, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x26,0xc0,0x0d]
-; X86-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
-; X86-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0]
+; X86-NEXT:    vgetmantpd $13, %xmm0, %xmm2 # encoding: [0x62,0xf3,0xfd,0x08,0x26,0xd0,0x0d]
+; X86-NEXT:    vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; X86-NEXT:    vgetmantpd $12, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x26,0xc0,0x0c]
+; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
@@ -4375,11 +4375,11 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X86-NEXT:    vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
-; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X86-NEXT:    vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
-; X86-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
+; X86-NEXT:    vcvtps2ph $10, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x0a]
+; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
+; X86-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_128:
@@ -4406,11 +4406,11 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X86-NEXT:    vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
-; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X86-NEXT:    vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
-; X86-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
+; X86-NEXT:    vcvtps2ph $11, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x0b]
+; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
+; X86-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4919,11 +4919,11 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; X86-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0x57,0xe4]
-; X86-NEXT:    vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
-; X86-NEXT:    vaddpd %xmm4, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x57,0xdb]
+; X86-NEXT:    vfixupimmpd $4, %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x04]
+; X86-NEXT:    vmovapd %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xe0]
+; X86-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm4 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x54,0xe2,0x05]
+; X86-NEXT:    vaddpd %xmm3, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x58,0xdb]
 ; X86-NEXT:    vfixupimmpd $3, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
 ; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -4954,11 +4954,11 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; X86-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05]
-; X86-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x57,0xd2]
-; X86-NEXT:    vfixupimmpd $3, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x03]
-; X86-NEXT:    vaddpd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x57,0xdb]
+; X86-NEXT:    vmovapd %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xe0]
+; X86-NEXT:    vfixupimmpd $3, %xmm3, %xmm1, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe3,0x03]
+; X86-NEXT:    vfixupimmpd $5, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x05]
+; X86-NEXT:    vaddpd %xmm4, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc4]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128:
@@ -4985,11 +4985,11 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT:    vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
-; X86-NEXT:    vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0x57,0xe4]
-; X86-NEXT:    vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
-; X86-NEXT:    vaddpd %ymm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x57,0xdb]
+; X86-NEXT:    vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
+; X86-NEXT:    vmovapd %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe0]
+; X86-NEXT:    vfixupimmpd $4, %ymm2, %ymm1, %ymm4 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x54,0xe2,0x04]
+; X86-NEXT:    vaddpd %ymm3, %ymm4, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0x58,0xdb]
 ; X86-NEXT:    vfixupimmpd $3, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
 ; X86-NEXT:    vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5020,12 +5020,12 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vxorpd %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe1,0x57,0xdb]
+; X86-NEXT:    vmovapd %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe0]
+; X86-NEXT:    vfixupimmpd $4, %ymm3, %ymm1, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe3,0x04]
 ; X86-NEXT:    vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
 ; X86-NEXT:    vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
-; X86-NEXT:    vxorpd %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0x57,0xe4]
-; X86-NEXT:    vmovapd %ymm0, %ymm5 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
-; X86-NEXT:    vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
-; X86-NEXT:    vaddpd %ymm5, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
+; X86-NEXT:    vaddpd %ymm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
 ; X86-NEXT:    vfixupimmpd $3, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
 ; X86-NEXT:    vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5057,12 +5057,12 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe0,0x57,0xdb]
+; X86-NEXT:    vmovaps %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0]
+; X86-NEXT:    vfixupimmps $6, %xmm3, %xmm1, %xmm4 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x54,0xe3,0x06]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
 ; X86-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x54,0xda,0x05]
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd8,0x57,0xe4]
-; X86-NEXT:    vmovaps %xmm0, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe8]
-; X86-NEXT:    vfixupimmps $6, %xmm4, %xmm1, %xmm5 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x54,0xec,0x06]
-; X86-NEXT:    vaddps %xmm5, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xdd]
+; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xdc]
 ; X86-NEXT:    vfixupimmps $7, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x54,0xc2,0x07]
 ; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5094,12 +5094,12 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe0,0x57,0xdb]
+; X86-NEXT:    vmovaps %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0]
+; X86-NEXT:    vfixupimmps $6, %xmm3, %xmm1, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x54,0xe3,0x06]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
 ; X86-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x54,0xda,0x05]
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd8,0x57,0xe4]
-; X86-NEXT:    vmovaps %xmm0, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe8]
-; X86-NEXT:    vfixupimmps $6, %xmm4, %xmm1, %xmm5 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x54,0xec,0x06]
-; X86-NEXT:    vaddps %xmm5, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xdd]
+; X86-NEXT:    vaddps %xmm4, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xdc]
 ; X86-NEXT:    vfixupimmps $7, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x54,0xc2,0x07]
 ; X86-NEXT:    vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5131,12 +5131,12 @@ define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe0,0x57,0xdb]
+; X86-NEXT:    vmovaps %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
+; X86-NEXT:    vfixupimmps $6, %ymm3, %ymm1, %ymm4 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x54,0xe3,0x06]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
 ; X86-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x54,0xda,0x05]
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd8,0x57,0xe4]
-; X86-NEXT:    vmovaps %ymm0, %ymm5 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe8]
-; X86-NEXT:    vfixupimmps $6, %ymm4, %ymm1, %ymm5 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x54,0xec,0x06]
-; X86-NEXT:    vaddps %ymm5, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xdd]
+; X86-NEXT:    vaddps %ymm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xdc]
 ; X86-NEXT:    vfixupimmps $7, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x54,0xc2,0x07]
 ; X86-NEXT:    vaddps %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -5168,12 +5168,12 @@ define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3 # encoding: [0xc5,0xe0,0x57,0xdb]
+; X86-NEXT:    vmovaps %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
+; X86-NEXT:    vfixupimmps $6, %ymm3, %ymm1, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x54,0xe3,0x06]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
 ; X86-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x54,0xda,0x05]
-; X86-NEXT:    vxorps %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd8,0x57,0xe4]
-; X86-NEXT:    vmovaps %ymm0, %ymm5 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe8]
-; X86-NEXT:    vfixupimmps $6, %ymm4, %ymm1, %ymm5 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x54,0xec,0x06]
-; X86-NEXT:    vaddps %ymm5, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xdd]
+; X86-NEXT:    vaddps %ymm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xdc]
 ; X86-NEXT:    vfixupimmps $7, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x54,0xc2,0x07]
 ; X86-NEXT:    vaddps %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6763,9 +6763,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
 define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6787,9 +6787,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1,
 define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmka:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6847,9 +6847,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a
 define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmb:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax){1to4}, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x19,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6875,9 +6875,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
 define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_ps_rmba:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213ps (%eax){1to4}, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x19,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6947,9 +6947,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
 define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd128_pd_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213pd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x00]
 ; X86-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -6989,9 +6989,9 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double>
 define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2, i8 %mask) {
 ; X86-LABEL: test_mask_vfmadd256_pd_rmk:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vfmadd213pd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x00]
 ; X86-NEXT:    # ymm0 {%k1} = (ymm1 * ymm0) + mem
 ; X86-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
index 0855baf2e1ba1..d109c1c9a13b3 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -18,9 +18,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x00]
 ; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -58,9 +58,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x00]
 ; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -98,9 +98,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x00]
 ; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -138,9 +138,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x00]
 ; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -178,9 +178,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x00]
 ; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -218,9 +218,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x00]
 ; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -259,9 +259,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x00]
 ; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -299,9 +299,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x00]
 ; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index ddf0050dbd74a..782db5427aeaa 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -17,9 +17,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x50,0x00]
 ; X86-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -60,9 +60,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x50,0x00]
 ; X86-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -105,9 +105,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x51,0x00]
 ; X86-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -148,9 +148,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpbusds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x51,0x00]
 ; X86-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -193,9 +193,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssd (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x52,0x00]
 ; X86-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -236,9 +236,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssd (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x52,0x00]
 ; X86-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
@@ -281,9 +281,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssds (%eax), %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x53,0x00]
 ; X86-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
 ; X86-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
@@ -331,9 +331,9 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT:    kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
 ; X86-NEXT:    vpdpwssds (%eax), %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x53,0x00]
 ; X86-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
 ; X86-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
diff --git a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll
index 9741972767bcd..06de7b48d52e5 100644
--- a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll
@@ -5,13 +5,13 @@
 define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -38,17 +38,17 @@ entry:
 define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -81,16 +81,16 @@ entry:
 define void @test_mm256_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi32_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
-; X86-NEXT:    vmovdqa (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x02]
-; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x00]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -118,25 +118,20 @@ entry:
 define void @test_mm256_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi64_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vmovdqa (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x06]
-; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x00]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -172,16 +167,16 @@ entry:
 define void @test_mm256_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi32_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
-; X86-NEXT:    vpbroadcastd (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x02]
-; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastd (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x00]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -213,25 +208,20 @@ entry:
 define void @test_mm256_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm256_2intersect_epi64_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vpbroadcastq (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x06]
-; X86-NEXT:    vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastq (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x00]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -271,17 +261,17 @@ entry:
 define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi32:
@@ -314,17 +304,17 @@ entry:
 define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi64:
@@ -355,25 +345,20 @@ entry:
 define void @test_mm_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi32_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
-; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x00]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi32_p:
@@ -407,25 +392,20 @@ entry:
 define void @test_mm_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi64_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vmovdqa (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x06]
-; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x00]
 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi64_p:
@@ -459,25 +439,20 @@ entry:
 define void @test_mm_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi32_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vpbroadcastd (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x06]
-; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x00]
 ; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
 ; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi32_b:
@@ -515,25 +490,20 @@ entry:
 define void @test_mm_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm_2intersect_epi64_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vpbroadcastq (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x06]
-; X86-NEXT:    vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastq (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x00]
 ; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
-; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
-; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
+; X86-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x0c]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
 ; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
-; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_mm_2intersect_epi64_b:
diff --git a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll
index 28e3d6dd5d849..b66c2367b7dbf 100644
--- a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll
@@ -5,10 +5,10 @@
 define void @test_mm512_2intersect_epi32(<8 x i64> %a, <8 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    vp2intersectd %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0xc1]
-; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %k0, (%eax) # encoding: [0xc5,0xf8,0x91,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -34,13 +34,13 @@ entry:
 define void @test_mm512_2intersect_epi64(<8 x i64> %a, <8 x i64> %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -65,19 +65,14 @@ entry:
 define void @test_mm512_2intersect_epi32_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi32_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vmovdqa64 (%esi), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x06]
-; X86-NEXT:    vp2intersectd (%edx), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x02]
-; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %k0, (%eax) # encoding: [0xc5,0xf8,0x91,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
 ; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -103,16 +98,16 @@ entry:
 define void @test_mm512_2intersect_epi64_p(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi64_p:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
-; X86-NEXT:    vmovdqa64 (%edx), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x02]
-; X86-NEXT:    vp2intersectq (%ecx), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x01]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x00]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -141,19 +136,14 @@ entry:
 define void @test_mm512_2intersect_epi32_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi32_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi # encoding: [0x56]
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-NEXT:    vpbroadcastd (%esi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x06]
-; X86-NEXT:    vp2intersectd (%edx){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x02]
-; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastd (%eax), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x58,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectd (%eax){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %k0, (%eax) # encoding: [0xc5,0xf8,0x91,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
 ; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
-; X86-NEXT:    popl %esi # encoding: [0x5e]
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -183,16 +173,16 @@ entry:
 define void @test_mm512_2intersect_epi64_b(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %m0, ptr nocapture %m1) {
 ; X86-LABEL: test_mm512_2intersect_epi64_b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
-; X86-NEXT:    vpbroadcastq (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x02]
-; X86-NEXT:    vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01]
-; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
-; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
-; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastq (%eax), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x00]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    vp2intersectq (%eax){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x00]
+; X86-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    movb %cl, (%edx) # encoding: [0x88,0x0a]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-NEXT:    movb %al, (%ecx) # encoding: [0x88,0x01]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
index 6c3d90aab77e8..df54c26bc9d51 100644
--- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -10,11 +10,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18]
-; X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
@@ -27,11 +27,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbssd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbssd (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128:
@@ -53,11 +53,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x18]
-; X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
@@ -70,11 +70,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbssds %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbssds (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128:
@@ -96,11 +96,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x18]
-; X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
@@ -113,11 +113,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbssd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbssd (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256:
@@ -139,11 +139,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18]
-; X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
@@ -156,11 +156,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbssds %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbssds (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256:
@@ -182,11 +182,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18]
-; X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
@@ -199,11 +199,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbsud %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbsud (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128:
@@ -225,11 +225,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x18]
-; X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
@@ -242,11 +242,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbsuds %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbsuds (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128:
@@ -268,11 +268,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x18]
-; X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
@@ -285,11 +285,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbsud %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbsud (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
@@ -311,11 +311,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18]
-; X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
@@ -328,11 +328,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbsuds %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbsuds (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
@@ -354,11 +354,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18]
-; X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
@@ -371,11 +371,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbuud %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbuud (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
@@ -397,11 +397,11 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
-; X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x18]
-; X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
-; X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0x00]
+; X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
@@ -414,11 +414,11 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbuuds %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbuuds (%eax), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
@@ -440,11 +440,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x18]
-; X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
@@ -457,11 +457,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, pt
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x18]
-; AVX10-X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbuud %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbuud (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
@@ -483,11 +483,11 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) {
 ; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
-; X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18]
-; X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
-; X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0xda]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0x00]
+; X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc3]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
@@ -500,11 +500,11 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, p
 ;
 ; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
 ; AVX10-X86:       # %bb.0:
-; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; AVX10-X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; AVX10-X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x18]
-; AVX10-X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2]
-; AVX10-X86-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; AVX10-X86-NEXT:    vpdpbuuds %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xda]
+; AVX10-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; AVX10-X86-NEXT:    vpdpbuuds (%eax), %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0x00]
+; AVX10-X86-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; AVX10-X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
index f363cad816dfb..09d74b79c68d6 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
@@ -34,17 +34,16 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    # kill: def $ax killed $ax def $eax
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
-; NOSSE-NEXT:    shll $16, %eax
-; NOSSE-NEXT:    movzwl %si, %edi
-; NOSSE-NEXT:    orl %eax, %edi
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT:    movw %ax, 4(%edi)
+; NOSSE-NEXT:    shll $16, %esi
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, 4(%esi)
-; NOSSE-NEXT:    movl %edi, (%esi)
-; NOSSE-NEXT:    movl %esi, %eax
+; NOSSE-NEXT:    movzwl %ax, %eax
+; NOSSE-NEXT:    orl %esi, %eax
+; NOSSE-NEXT:    movl %eax, (%edi)
+; NOSSE-NEXT:    movl %edi, %eax
 ; NOSSE-NEXT:    addl $4, %esp
 ; NOSSE-NEXT:    popl %esi
 ; NOSSE-NEXT:    popl %edi
@@ -62,17 +61,16 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    # kill: def $ax killed $ax def $eax
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
-; SSE-NEXT:    shll $16, %eax
-; SSE-NEXT:    movzwl %si, %edi
-; SSE-NEXT:    orl %eax, %edi
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; SSE-NEXT:    movw %ax, 4(%edi)
+; SSE-NEXT:    shll $16, %esi
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, 4(%esi)
-; SSE-NEXT:    movl %edi, (%esi)
-; SSE-NEXT:    movl %esi, %eax
+; SSE-NEXT:    movzwl %ax, %eax
+; SSE-NEXT:    orl %esi, %eax
+; SSE-NEXT:    movl %eax, (%edi)
+; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    addl $4, %esp
 ; SSE-NEXT:    popl %esi
 ; SSE-NEXT:    popl %edi
@@ -83,11 +81,10 @@ define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
 define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
 ; NOSSE-LABEL: return_arg_v4bf16:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    pushl %ebp
 ; NOSSE-NEXT:    pushl %ebx
 ; NOSSE-NEXT:    pushl %edi
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    subl $12, %esp
+; NOSSE-NEXT:    subl $16, %esp
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -99,30 +96,27 @@ define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movl %eax, %ebx
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movw %ax, 6(%ebx)
+; NOSSE-NEXT:    movw %di, 4(%ebx)
+; NOSSE-NEXT:    movw %si, 2(%ebx)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, 6(%ebp)
-; NOSSE-NEXT:    movw %bx, 4(%ebp)
-; NOSSE-NEXT:    movw %di, 2(%ebp)
-; NOSSE-NEXT:    movw %si, (%ebp)
-; NOSSE-NEXT:    movl %ebp, %eax
-; NOSSE-NEXT:    addl $12, %esp
+; NOSSE-NEXT:    movw %ax, (%ebx)
+; NOSSE-NEXT:    movl %ebx, %eax
+; NOSSE-NEXT:    addl $16, %esp
 ; NOSSE-NEXT:    popl %esi
 ; NOSSE-NEXT:    popl %edi
 ; NOSSE-NEXT:    popl %ebx
-; NOSSE-NEXT:    popl %ebp
 ; NOSSE-NEXT:    retl $4
 ;
 ; SSE-LABEL: return_arg_v4bf16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushl %ebp
 ; SSE-NEXT:    pushl %ebx
 ; SSE-NEXT:    pushl %edi
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    subl $12, %esp
+; SSE-NEXT:    subl $16, %esp
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -134,21 +128,19 @@ define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movl %eax, %ebx
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movw %ax, 6(%ebx)
+; SSE-NEXT:    movw %di, 4(%ebx)
+; SSE-NEXT:    movw %si, 2(%ebx)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, 6(%ebp)
-; SSE-NEXT:    movw %bx, 4(%ebp)
-; SSE-NEXT:    movw %di, 2(%ebp)
-; SSE-NEXT:    movw %si, (%ebp)
-; SSE-NEXT:    movl %ebp, %eax
-; SSE-NEXT:    addl $12, %esp
+; SSE-NEXT:    movw %ax, (%ebx)
+; SSE-NEXT:    movl %ebx, %eax
+; SSE-NEXT:    addl $16, %esp
 ; SSE-NEXT:    popl %esi
 ; SSE-NEXT:    popl %edi
 ; SSE-NEXT:    popl %ebx
-; SSE-NEXT:    popl %ebp
 ; SSE-NEXT:    retl $4
   ret <4 x bfloat> %x
 }
@@ -176,7 +168,7 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    movl %eax, %ebp
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -188,24 +180,22 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movl %eax, %ebx
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, 14(%ebp)
-; NOSSE-NEXT:    movw %bx, 12(%ebp)
-; NOSSE-NEXT:    movw %di, 10(%ebp)
-; NOSSE-NEXT:    movw %si, 8(%ebp)
-; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; NOSSE-NEXT:    movw %ax, 6(%ebp)
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; NOSSE-NEXT:    movw %ax, 14(%ebx)
+; NOSSE-NEXT:    movw %di, 12(%ebx)
+; NOSSE-NEXT:    movw %si, 10(%ebx)
+; NOSSE-NEXT:    movw %bp, 8(%ebx)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; NOSSE-NEXT:    movw %ax, 4(%ebp)
+; NOSSE-NEXT:    movw %ax, 6(%ebx)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; NOSSE-NEXT:    movw %ax, 2(%ebp)
+; NOSSE-NEXT:    movw %ax, 4(%ebx)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; NOSSE-NEXT:    movw %ax, (%ebp)
-; NOSSE-NEXT:    movl %ebp, %eax
+; NOSSE-NEXT:    movw %ax, 2(%ebx)
+; NOSSE-NEXT:    calll __truncsfbf2
+; NOSSE-NEXT:    movw %ax, (%ebx)
+; NOSSE-NEXT:    movl %ebx, %eax
 ; NOSSE-NEXT:    addl $12, %esp
 ; NOSSE-NEXT:    popl %esi
 ; NOSSE-NEXT:    popl %edi
@@ -235,7 +225,7 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movl %eax, %ebp
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -247,24 +237,22 @@ define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movl %eax, %ebx
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, 14(%ebp)
-; SSE-NEXT:    movw %bx, 12(%ebp)
-; SSE-NEXT:    movw %di, 10(%ebp)
-; SSE-NEXT:    movw %si, 8(%ebp)
-; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; SSE-NEXT:    movw %ax, 6(%ebp)
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SSE-NEXT:    movw %ax, 14(%ebx)
+; SSE-NEXT:    movw %di, 12(%ebx)
+; SSE-NEXT:    movw %si, 10(%ebx)
+; SSE-NEXT:    movw %bp, 8(%ebx)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; SSE-NEXT:    movw %ax, 4(%ebp)
+; SSE-NEXT:    movw %ax, 6(%ebx)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; SSE-NEXT:    movw %ax, 2(%ebp)
+; SSE-NEXT:    movw %ax, 4(%ebx)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
-; SSE-NEXT:    movw %ax, (%ebp)
-; SSE-NEXT:    movl %ebp, %eax
+; SSE-NEXT:    movw %ax, 2(%ebx)
+; SSE-NEXT:    calll __truncsfbf2
+; SSE-NEXT:    movw %ax, (%ebx)
+; SSE-NEXT:    movl %ebx, %eax
 ; SSE-NEXT:    addl $12, %esp
 ; SSE-NEXT:    popl %esi
 ; SSE-NEXT:    popl %edi
@@ -329,7 +317,7 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT:    movl %eax, %ebp
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
@@ -341,15 +329,13 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movl %eax, %ebp
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; NOSSE-NEXT:    calll __truncsfbf2
 ; NOSSE-NEXT:    movw %ax, 30(%edi)
-; NOSSE-NEXT:    movw %bp, 28(%edi)
-; NOSSE-NEXT:    movw %bx, 26(%edi)
-; NOSSE-NEXT:    movw %si, 24(%edi)
+; NOSSE-NEXT:    movw %bx, 28(%edi)
+; NOSSE-NEXT:    movw %si, 26(%edi)
+; NOSSE-NEXT:    movw %bp, 24(%edi)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
 ; NOSSE-NEXT:    movw %ax, 22(%edi)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
@@ -372,7 +358,7 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; NOSSE-NEXT:    movw %ax, 4(%edi)
 ; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
 ; NOSSE-NEXT:    movw %ax, 2(%edi)
-; NOSSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT:    calll __truncsfbf2
 ; NOSSE-NEXT:    movw %ax, (%edi)
 ; NOSSE-NEXT:    movl %edi, %eax
 ; NOSSE-NEXT:    addl $28, %esp
@@ -436,7 +422,7 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT:    movl %eax, %ebp
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
@@ -448,15 +434,13 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movl %eax, %ebp
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SSE-NEXT:    calll __truncsfbf2
 ; SSE-NEXT:    movw %ax, 30(%edi)
-; SSE-NEXT:    movw %bp, 28(%edi)
-; SSE-NEXT:    movw %bx, 26(%edi)
-; SSE-NEXT:    movw %si, 24(%edi)
+; SSE-NEXT:    movw %bx, 28(%edi)
+; SSE-NEXT:    movw %si, 26(%edi)
+; SSE-NEXT:    movw %bp, 24(%edi)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
 ; SSE-NEXT:    movw %ax, 22(%edi)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
@@ -479,7 +463,7 @@ define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
 ; SSE-NEXT:    movw %ax, 4(%edi)
 ; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
 ; SSE-NEXT:    movw %ax, 2(%edi)
-; SSE-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT:    calll __truncsfbf2
 ; SSE-NEXT:    movw %ax, (%edi)
 ; SSE-NEXT:    movl %edi, %eax
 ; SSE-NEXT:    addl $28, %esp
@@ -543,14 +527,13 @@ define void @call_ret_bf16(ptr %ptr) #0 {
 define void @call_ret_v2bf16(ptr %ptr) #0 {
 ; NOSSE-LABEL: call_ret_v2bf16:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    pushl %edi
 ; NOSSE-NEXT:    pushl %esi
-; NOSSE-NEXT:    subl $20, %esp
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; NOSSE-NEXT:    movzwl 2(%edi), %eax
+; NOSSE-NEXT:    subl $24, %esp
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT:    movzwl 2(%esi), %eax
 ; NOSSE-NEXT:    shll $16, %eax
 ; NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; NOSSE-NEXT:    movl (%edi), %eax
+; NOSSE-NEXT:    movl (%esi), %eax
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    shll $16, %eax
@@ -558,31 +541,27 @@ define void @call_ret_v2bf16(ptr %ptr) #0 {
 ; NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll returns_v2bf16@PLT
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; NOSSE-NEXT:    fstps (%esp)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movl %eax, %esi
 ; NOSSE-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; NOSSE-NEXT:    fstps (%esp)
+; NOSSE-NEXT:    movw %ax, 2(%esi)
 ; NOSSE-NEXT:    calll __truncsfbf2
-; NOSSE-NEXT:    movw %ax, 2(%edi)
-; NOSSE-NEXT:    movw %si, (%edi)
-; NOSSE-NEXT:    addl $20, %esp
+; NOSSE-NEXT:    movw %ax, (%esi)
+; NOSSE-NEXT:    addl $24, %esp
 ; NOSSE-NEXT:    popl %esi
-; NOSSE-NEXT:    popl %edi
 ; NOSSE-NEXT:    retl
 ;
 ; SSE-LABEL: call_ret_v2bf16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushl %edi
 ; SSE-NEXT:    pushl %esi
-; SSE-NEXT:    subl $36, %esp
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SSE-NEXT:    movzwl 2(%edi), %eax
+; SSE-NEXT:    subl $40, %esp
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT:    movzwl 2(%esi), %eax
 ; SSE-NEXT:    shll $16, %eax
 ; SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; SSE-NEXT:    movl (%edi), %eax
+; SSE-NEXT:    movl (%esi), %eax
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-NEXT:    shll $16, %eax
@@ -590,19 +569,16 @@ define void @call_ret_v2bf16(ptr %ptr) #0 {
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss %xmm0, (%esp)
 ; SSE-NEXT:    calll returns_v2bf16@PLT
-; SSE-NEXT:    fxch %st(1)
 ; SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; SSE-NEXT:    fstps (%esp)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movl %eax, %esi
 ; SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; SSE-NEXT:    fstps (%esp)
+; SSE-NEXT:    movw %ax, 2(%esi)
 ; SSE-NEXT:    calll __truncsfbf2
-; SSE-NEXT:    movw %ax, 2(%edi)
-; SSE-NEXT:    movw %si, (%edi)
-; SSE-NEXT:    addl $36, %esp
+; SSE-NEXT:    movw %ax, (%esi)
+; SSE-NEXT:    addl $40, %esp
 ; SSE-NEXT:    popl %esi
-; SSE-NEXT:    popl %edi
 ; SSE-NEXT:    retl
   %val = load <2 x bfloat>, ptr %ptr
   %bf16 = call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val)
@@ -714,8 +690,8 @@ define void @call_ret_v4bf16(ptr %ptr) #0 {
 ; NOSSE-NEXT:    subl $4, %esp
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; NOSSE-NEXT:    movw %dx, 6(%esi)
+; NOSSE-NEXT:    movw %cx, 6(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; NOSSE-NEXT:    movw %cx, 4(%esi)
 ; NOSSE-NEXT:    movl %eax, (%esi)
 ; NOSSE-NEXT:    addl $48, %esp
@@ -757,8 +733,8 @@ define void @call_ret_v4bf16(ptr %ptr) #0 {
 ; SSE-NEXT:    subl $4, %esp
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; SSE-NEXT:    movw %dx, 6(%esi)
+; SSE-NEXT:    movw %cx, 6(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-NEXT:    movw %cx, 4(%esi)
 ; SSE-NEXT:    movl %eax, (%esi)
 ; SSE-NEXT:    addl $48, %esp
@@ -832,20 +808,18 @@ define void @call_ret_v8bf16(ptr %ptr) #0 {
 ; NOSSE-NEXT:    calll returns_v8bf16@PLT
 ; NOSSE-NEXT:    subl $4, %esp
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
-; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movw %ax, 14(%esi)
-; NOSSE-NEXT:    movw %bp, 12(%esi)
-; NOSSE-NEXT:    movw %bx, 10(%esi)
-; NOSSE-NEXT:    movw %di, 8(%esi)
-; NOSSE-NEXT:    movw %dx, 6(%esi)
+; NOSSE-NEXT:    movw %cx, 14(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movw %cx, 12(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movw %cx, 10(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movw %cx, 8(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NEXT:    movw %cx, 6(%esi)
+; NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; NOSSE-NEXT:    movw %cx, 4(%esi)
-; NOSSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; NOSSE-NEXT:    movl %eax, (%esi)
 ; NOSSE-NEXT:    addl $108, %esp
 ; NOSSE-NEXT:    popl %esi
@@ -913,20 +887,18 @@ define void @call_ret_v8bf16(ptr %ptr) #0 {
 ; SSE-NEXT:    calll returns_v8bf16@PLT
 ; SSE-NEXT:    subl $4, %esp
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ebp
-; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    movw %ax, 14(%esi)
-; SSE-NEXT:    movw %bp, 12(%esi)
-; SSE-NEXT:    movw %bx, 10(%esi)
-; SSE-NEXT:    movw %di, 8(%esi)
-; SSE-NEXT:    movw %dx, 6(%esi)
+; SSE-NEXT:    movw %cx, 14(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movw %cx, 12(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movw %cx, 10(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movw %cx, 8(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT:    movw %cx, 6(%esi)
+; SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; SSE-NEXT:    movw %cx, 4(%esi)
-; SSE-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; SSE-NEXT:    movl %eax, (%esi)
 ; SSE-NEXT:    addl $108, %esp
 ; SSE-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 7bccd6ba088ac..388810cf64c84 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -9,16 +9,16 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; X86-LABEL: add:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl (%edx), %edx
-; X86-NEXT:    shll $16, %edx
-; X86-NEXT:    vmovd %edx, %xmm0
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm1
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -140,36 +140,30 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
 define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; X86-LABEL: add_double:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    calll __truncdfbf2
-; X86-NEXT:    vmovw %xmm0, %edi
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmovsd %xmm1, (%esp)
+; X86-NEXT:    vmovw %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    calll __truncdfbf2
 ; X86-NEXT:    vmovw %xmm0, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    shll $16, %edi
-; X86-NEXT:    vmovd %edi, %xmm1
-; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vaddss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
 ; X86-NEXT:    vmovw %xmm0, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; X86-NEXT:    vmovsd %xmm0, (%esi)
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovsd %xmm0, (%eax)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; SSE2-LABEL: add_double:
@@ -300,21 +294,20 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
 define double @add_double2(double %da, double %db) nounwind {
 ; X86-LABEL: add_double2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $28, %esp
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    calll __truncdfbf2
-; X86-NEXT:    vmovw %xmm0, %esi
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmovsd %xmm1, (%esp)
+; X86-NEXT:    vmovw %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    calll __truncdfbf2
 ; X86-NEXT:    vmovw %xmm0, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    shll $16, %esi
-; X86-NEXT:    vmovd %esi, %xmm1
-; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vaddss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
 ; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
 ; X86-NEXT:    vmovw %xmm0, %eax
 ; X86-NEXT:    shll $16, %eax
@@ -322,8 +315,7 @@ define double @add_double2(double %da, double %db) nounwind {
 ; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; SSE2-LABEL: add_double2:
@@ -436,12 +428,12 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
 ; X86-LABEL: add_constant:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -554,9 +546,9 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
 ; X86-LABEL: fold_ext_trunc:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fold_ext_trunc:
@@ -872,7 +864,7 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    retl
 ;
 ; SSE2-LABEL: pr62997:
@@ -1654,16 +1646,12 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $204, %esp
 ; X86-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovhps %xmm1, (%esp)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll __truncdfbf2
 ; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovhps %xmm0, (%esp)
-; X86-NEXT:    calll __truncdfbf2
-; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
 ; X86-NEXT:    vmovlps %xmm0, (%esp)
 ; X86-NEXT:    vzeroupper
@@ -1695,20 +1683,26 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
 ; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vmovhps %xmm0, (%esp)
 ; X86-NEXT:    calll __truncdfbf2
+; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vmovlps %xmm1, (%esp)
 ; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
 ; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
 ; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X86-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncdfbf2
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; X86-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT:    vpunpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-NEXT:    addl $204, %esp
 ; X86-NEXT:    retl
 ;
@@ -2174,14 +2168,14 @@ define void @PR92471(ptr %0, ptr %1) nounwind {
 ; X86-LABEL: PR92471:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vpinsrd $1, 4(%ecx), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, 8(%ecx), %xmm0, %xmm0
-; X86-NEXT:    vpinsrw $6, 12(%ecx), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $1, 4(%eax), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $2, 8(%eax), %xmm0, %xmm0
+; X86-NEXT:    vpinsrw $6, 12(%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X86-NEXT:    vpslld $16, %ymm0, %ymm0
 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrd $2, %xmm1, 24(%eax)
 ; X86-NEXT:    vpextrd $1, %xmm1, 20(%eax)
 ; X86-NEXT:    vmovd %xmm1, 16(%eax)
diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll
index fe48a96a51d3e..a3d00471fab2c 100644
--- a/llvm/test/CodeGen/X86/bitcast-mmx.ll
+++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll
@@ -34,10 +34,10 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movq 8(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    psllq %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd 16(%ebp), %mm0
+; X86-NEXT:    movq 8(%ebp), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -65,11 +65,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    psllq %mm1, %mm0
-; X86-NEXT:    por 8(%ebp), %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd 16(%ebp), %mm0
+; X86-NEXT:    movd 20(%ebp), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    por 8(%ebp), %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -104,11 +104,11 @@ define i64 @t3(ptr %y, ptr %n) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psllq %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 5020c8af5c8aa..79d28930b0158 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -13,25 +13,25 @@ declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
 define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ; X86-LABEL: test_bitreverse_v2i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
-; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
+; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    shrl $4, %eax
 ; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $13107, %edx # imm = 0x3333
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
 ; X86-NEXT:    shrl $2, %eax
 ; X86-NEXT:    andl $13107, %eax # imm = 0x3333
-; X86-NEXT:    leal (%eax,%edx,4), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl $21845, %edx # imm = 0x5555
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $21845, %eax # imm = 0x5555
-; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    rolw $8, %cx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
@@ -497,19 +497,19 @@ declare i4 @llvm.bitreverse.i4(i4) readnone
 define i4 @test_bitreverse_i4(i4 %a) {
 ; X86-LABEL: test_bitreverse_i4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $8, %al
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    addb %cl, %dl
-; X86-NEXT:    andb $4, %dl
-; X86-NEXT:    movb %cl, %ah
-; X86-NEXT:    shlb $3, %ah
-; X86-NEXT:    andb $8, %ah
-; X86-NEXT:    orb %dl, %ah
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addb %al, %cl
+; X86-NEXT:    andb $4, %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shlb $3, %dl
+; X86-NEXT:    andb $8, %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    shrb %cl
 ; X86-NEXT:    andb $2, %cl
-; X86-NEXT:    orb %ah, %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    andb $8, %al
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
@@ -697,89 +697,88 @@ define i128 @test_bitreverse_i128(i128 %a) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%edx,2), %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
 ; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    leal (%ecx,%edi,4), %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%edi,2), %ecx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
-; X86-NEXT:    shrl $4, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    leal (%esi,%edi,4), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    leal (%esi,%edi,2), %esi
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
 ; X86-NEXT:    shrl $4, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    leal (%edx,%edi,4), %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
 ; X86-NEXT:    shrl %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    leal (%edx,%edi,2), %edx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%edi,4), %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%edi,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -830,9 +829,9 @@ define i128 @test_bitreverse_i128(i128 %a) nounwind {
 ; X86XOP-NEXT:    movl %esp, %ebp
 ; X86XOP-NEXT:    andl $-16, %esp
 ; X86XOP-NEXT:    subl $16, %esp
-; X86XOP-NEXT:    movl 8(%ebp), %eax
 ; X86XOP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,3,0,1]
 ; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    movl 8(%ebp), %eax
 ; X86XOP-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86XOP-NEXT:    movl %ebp, %esp
 ; X86XOP-NEXT:    popl %ebp
@@ -844,10 +843,10 @@ define i128 @test_bitreverse_i128(i128 %a) nounwind {
 ; X86GFNI-NEXT:    movl %esp, %ebp
 ; X86GFNI-NEXT:    andl $-16, %esp
 ; X86GFNI-NEXT:    subl $16, %esp
-; X86GFNI-NEXT:    movl 8(%ebp), %eax
 ; X86GFNI-NEXT:    vmovdqu 24(%ebp), %xmm0
 ; X86GFNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
+; X86GFNI-NEXT:    movl 8(%ebp), %eax
 ; X86GFNI-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86GFNI-NEXT:    movl %ebp, %esp
 ; X86GFNI-NEXT:    popl %ebp
@@ -870,173 +869,161 @@ define i128 @test_bitreverse_i128(i128 %a) nounwind {
 define i256 @test_bitreverse_i256(i256 %a) nounwind {
 ; X86-LABEL: test_bitreverse_i256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%edx,2), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ebp
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
 ; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    leal (%ecx,%ebp,4), %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    andl $1431655765, %ebp # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%ebp,2), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
 ; X86-NEXT:    shrl $4, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NEXT:    shrl $2, %edx
 ; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    leal (%edx,%ecx,4), %ecx
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    leal (%ecx,%edx,2), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    leal (%esi,%ecx,4), %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    shrl %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%esi,2), %esi
-; X86-NEXT:    bswapl %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %ebx
-; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %ebx
-; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    leal (%ebx,%ecx,4), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT:    shrl %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%ebx,2), %ebx
-; X86-NEXT:    bswapl %edi
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    leal (%edi,%ecx,4), %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%edi,2), %edi
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebp, 20(%eax)
-; X86-NEXT:    movl %edi, 16(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_bitreverse_i256:
@@ -1121,24 +1108,24 @@ define i256 @test_bitreverse_i256(i256 %a) nounwind {
 ;
 ; X86XOP-LABEL: test_bitreverse_i256:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
 ; X86XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
-; X86XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm2
-; X86XOP-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm0
-; X86XOP-NEXT:    vmovdqa %xmm0, 16(%eax)
-; X86XOP-NEXT:    vmovdqa %xmm2, (%eax)
+; X86XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; X86XOP-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86XOP-NEXT:    vmovdqa %xmm1, 16(%eax)
+; X86XOP-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86XOP-NEXT:    vzeroupper
 ; X86XOP-NEXT:    retl $4
 ;
 ; X86GFNI-LABEL: test_bitreverse_i256:
 ; X86GFNI:       # %bb.0:
-; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86GFNI-NEXT:    vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
 ; X86GFNI-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
 ; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0
+; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86GFNI-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86GFNI-NEXT:    vzeroupper
 ; X86GFNI-NEXT:    retl $4
@@ -1165,71 +1152,48 @@ define i256 @test_bitreverse_i256(i256 %a) nounwind {
 define i512 @test_bitreverse_i512(i512 %a) nounwind {
 ; X86-LABEL: test_bitreverse_i512:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    leal (%eax,%edx,2), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 60(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 56(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    bswapl %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ebp
-; X86-NEXT:    shrl $4, %ebx
-; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ebp, %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
-; X86-NEXT:    shrl $2, %ebx
-; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $1431655765, %ebp # imm = 0x55555555
-; X86-NEXT:    shrl %ebx
-; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT:    leal (%ebx,%ebp,2), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bswapl %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ebx
-; X86-NEXT:    shrl $4, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    leal (%edi,%ebx,4), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT:    shrl %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    leal (%edi,%ebx,2), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
-; X86-NEXT:    shrl $4, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    leal (%esi,%edi,4), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    leal (%esi,%edi,2), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
@@ -1247,7 +1211,7 @@ define i512 @test_bitreverse_i512(i512 %a) nounwind {
 ; X86-NEXT:    shrl %edx
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    leal (%edx,%esi,2), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, 52(%eax)
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
@@ -1265,241 +1229,236 @@ define i512 @test_bitreverse_i512(i512 %a) nounwind {
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    leal (%ecx,%edx,2), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 44(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 36(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %ecx
 ; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edx, 56(%eax)
-; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %edi, 48(%eax)
-; X86-NEXT:    movl %ebx, 44(%eax)
-; X86-NEXT:    movl %ebp, 40(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $40, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_bitreverse_i512:
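
For orientation while reading the scalar X86 hunks above and below: every 32-bit chunk is reversed with the usual bswap-plus-constant-mask expansion, and only the placement of the loads and stores changes, not the arithmetic. A minimal C sketch of what each repeated CHECK sequence computes (illustration only, not part of the patch):

#include <stdint.h>

/* Per-dword lowering visible in the CHECK lines: bswapl, then three
   mask/shift rounds. The asm folds each "(lo | hi << k)" into
   leal (%lo,%hi,2^k), which is equivalent because the masks are disjoint. */
static uint32_t bitreverse32(uint32_t x) {
    x = __builtin_bswap32(x);                               /* bswapl       */
    x = ((x & 0x0F0F0F0F) << 4) | ((x >> 4) & 0x0F0F0F0F); /* swap nibbles */
    x = ((x & 0x33333333) << 2) | ((x >> 2) & 0x33333333); /* swap pairs   */
    x = ((x & 0x55555555) << 1) | ((x >> 1) & 0x55555555); /* swap bits    */
    return x;
}

The i512 and i528 bodies are sixteen-plus copies of this, one per chunk, which is why the scheduler change reshuffles so many lines while the computed values stay identical.
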
@@ -1667,32 +1626,32 @@ define i512 @test_bitreverse_i512(i512 %a) nounwind {
 ;
 ; X86XOP-LABEL: test_bitreverse_i512:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
 ; X86XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
-; X86XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm2
-; X86XOP-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm0
-; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3,0,1]
-; X86XOP-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[1,0,3,2]
-; X86XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm4
-; X86XOP-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; X86XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm1
+; X86XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; X86XOP-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86XOP-NEXT:    vmovdqa %xmm1, 48(%eax)
-; X86XOP-NEXT:    vmovdqa %xmm4, 32(%eax)
-; X86XOP-NEXT:    vmovdqa %xmm0, 16(%eax)
-; X86XOP-NEXT:    vmovdqa %xmm2, (%eax)
+; X86XOP-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovdqa %xmm0, 32(%eax)
+; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X86XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86XOP-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; X86XOP-NEXT:    vmovdqa %xmm1, 16(%eax)
+; X86XOP-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; X86XOP-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86XOP-NEXT:    vzeroupper
 ; X86XOP-NEXT:    retl $4
 ;
 ; X86GFNI-LABEL: test_bitreverse_i512:
 ; X86GFNI:       # %bb.0:
-; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86GFNI-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
 ; X86GFNI-NEXT:    vpermq {{[0-9]+}}(%esp), %zmm0, %zmm0
 ; X86GFNI-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
 ; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
+; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86GFNI-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86GFNI-NEXT:    vzeroupper
 ; X86GFNI-NEXT:    retl $4
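
On the XOP paths just above, the whole mask dance collapses into vpperm with the selector [87,86,...,89,88] (bytes 0x50-0x5F): the low five bits pick the second source's bytes in descending order within each 64-bit half, and, if I recall the XOP encoding correctly, the 0b010 op field in bits 7:5 returns each selected byte bit-reversed. A plain-C model of that per-byte operation (the selector interpretation is my reading of the spec, not something this patch touches):

#include <stdint.h>

/* Per-byte bit reversal; per my reading of the XOP spec this is what
   VPPERM applies when a selector byte's top three bits are 0b010
   (selector values 0x40-0x5F). */
static uint8_t bitrev8(uint8_t b) {
    b = (uint8_t)((b << 4) | (b >> 4));                   /* swap nibbles */
    b = (uint8_t)(((b & 0x33) << 2) | ((b >> 2) & 0x33)); /* swap pairs   */
    b = (uint8_t)(((b & 0x55) << 1) | ((b >> 1) & 0x55)); /* swap bits    */
    return b;
}

Byte order within each qword is reversed by the descending indices, bits within each byte by the op field, so a single vpperm bit-reverses both 64-bit lanes at once.
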
@@ -1719,96 +1678,20 @@ define i512 @test_bitreverse_i512(i512 %a) nounwind {
 ; X64GFNI-NEXT:    vzeroupper
 ; X64GFNI-NEXT:    retq
   %b = call i512 @llvm.bitreverse.i512(i512 %a)
-  ret i512 %b
-}
-
-; Make sure we don't assert during type legalization promoting a large
-; bitreverse due to the need for a large shift that won't fit in the i8 returned
-; from getShiftAmountTy.
-define i528 @large_promotion(i528 %A) nounwind {
-; X86-LABEL: large_promotion:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $60, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    bswapl %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ebp
-; X86-NEXT:    shrl $4, %ebx
-; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ebp, %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
-; X86-NEXT:    shrl $2, %ebx
-; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
-; X86-NEXT:    shrl %ebx
-; X86-NEXT:    andl $1431633920, %ebx # imm = 0x55550000
-; X86-NEXT:    leal (%ebx,%ebp,2), %ebp
-; X86-NEXT:    bswapl %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ebx
-; X86-NEXT:    shrl $4, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    leal (%edi,%ebx,4), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NEXT:    shrl %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    leal (%edi,%ebx,2), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %edi
-; X86-NEXT:    shrl $4, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    leal (%esi,%edi,4), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    leal (%esi,%edi,2), %ebx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %esi
-; X86-NEXT:    shrl $4, %edx
-; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NEXT:    shrl $2, %edx
-; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NEXT:    leal (%edx,%esi,4), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NEXT:    leal (%edx,%esi,2), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+  ret i512 %b
+}
+
+; Make sure we don't assert during type legalization promoting a large
+; bitreverse due to the need for a large shift that won't fit in the i8 returned
+; from getShiftAmountTy.
+define i528 @large_promotion(i528 %A) nounwind {
+; X86-LABEL: large_promotion:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
@@ -1825,216 +1708,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    shrl %ecx
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    leal (%ecx,%edx,2), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    shrl $4, %eax
-; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%ecx,%edx,2), %edx
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
@@ -2051,84 +1725,316 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrdl $16, %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $16, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ebp, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shrdl $16, %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %edi
-; X86-NEXT:    shrdl $16, %edx, %ecx
+; X86-NEXT:    leal (%eax,%ecx,2), %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shldl $16, %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edi, 56(%eax)
-; X86-NEXT:    movl %ebx, 52(%eax)
-; X86-NEXT:    movl %ebp, 48(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 60(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %edx
+; X86-NEXT:    movl %edx, 56(%eax)
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    leal (%esi,%edx,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 52(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 48(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 44(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 40(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 36(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 32(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %esi
+; X86-NEXT:    shldl $16, %esi, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shldl $16, %edi, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    shldl $16, %edx, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    leal (%esi,%edi,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $1431633920, %edi # imm = 0x55550000
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    andl $1431633920, %esi # imm = 0x55550000
+; X86-NEXT:    leal (%esi,%edi,2), %esi
+; X86-NEXT:    shrdl $16, %edx, %esi
 ; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    shrl $16, %edx
-; X86-NEXT:    movw %dx, 64(%eax)
-; X86-NEXT:    addl $60, %esp
+; X86-NEXT:    shrl $16, %ecx
+; X86-NEXT:    movw %cx, 64(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: large_promotion:
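
One more pattern worth calling out in the new i528 output above: the spill slots disappear because adjacent reversed dwords are now combined on the fly with shldl $16 (528 mod 32 = 16, so, as I read the output, every stored dword straddles two reversed 32-bit inputs). A small C model of the funnel shifts involved, assuming AT&T operand order with the destination last:

#include <stdint.h>

/* Model of `shldl $16, %lo, %hi`: %hi is shifted left by 16 and filled
   at the bottom from the top 16 bits of %lo. */
static uint32_t shld16(uint32_t hi, uint32_t lo) {
    return (hi << 16) | (lo >> 16);
}

/* The tail store uses the mirror form, `shrdl $16, %hi, %lo`: %lo is
   shifted right by 16 and filled at the top from the low 16 bits of %hi. */
static uint32_t shrd16(uint32_t lo, uint32_t hi) {
    return (lo >> 16) | (hi << 16);
}

The same funnel-shift rewrite shows up in the X86XOP hunk below, where the dwords come from vpextrd instead of memory.
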
@@ -2327,112 +2233,81 @@ define i528 @large_promotion(i528 %A) nounwind {
 ;
 ; X86XOP-LABEL: large_promotion:
 ; X86XOP:       # %bb.0:
-; X86XOP-NEXT:    pushl %ebp
-; X86XOP-NEXT:    pushl %ebx
-; X86XOP-NEXT:    pushl %edi
 ; X86XOP-NEXT:    pushl %esi
-; X86XOP-NEXT:    subl $44, %esp
+; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86XOP-NEXT:    vshufpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; X86XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm0 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
-; X86XOP-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; X86XOP-NEXT:    vpperm %xmm0, %xmm2, %xmm0, %xmm2
+; X86XOP-NEXT:    vpextrd $2, %xmm2, %edx
+; X86XOP-NEXT:    vpextrd $3, %xmm2, %ecx
+; X86XOP-NEXT:    movl %ecx, %esi
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86XOP-NEXT:    movl %esi, 60(%eax)
+; X86XOP-NEXT:    vpextrd $1, %xmm2, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 56(%eax)
+; X86XOP-NEXT:    vmovd %xmm2, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 52(%eax)
 ; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
-; X86XOP-NEXT:    vpextrd $3, %xmm1, %eax
+; X86XOP-NEXT:    vpextrd $3, %xmm1, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 48(%eax)
+; X86XOP-NEXT:    vpextrd $2, %xmm1, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 44(%eax)
+; X86XOP-NEXT:    vpextrd $1, %xmm1, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 40(%eax)
+; X86XOP-NEXT:    vmovd %xmm1, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 36(%eax)
 ; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3,0,1]
 ; X86XOP-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
-; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm2
-; X86XOP-NEXT:    vmovd %xmm2, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %eax
-; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $1, %xmm2, %eax
-; X86XOP-NEXT:    shrdl $16, %eax, %ecx
-; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $2, %xmm2, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %eax
-; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $3, %xmm2, %eax
-; X86XOP-NEXT:    shrdl $16, %eax, %ecx
-; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86XOP-NEXT:    vpperm %xmm0, %xmm2, %xmm0, %xmm2
+; X86XOP-NEXT:    vpextrd $3, %xmm2, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 32(%eax)
+; X86XOP-NEXT:    vpextrd $2, %xmm2, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 28(%eax)
+; X86XOP-NEXT:    vpextrd $1, %xmm2, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 24(%eax)
+; X86XOP-NEXT:    vmovd %xmm2, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 20(%eax)
 ; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
-; X86XOP-NEXT:    vmovd %xmm1, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %eax
-; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
-; X86XOP-NEXT:    shrdl $16, %eax, %ecx
-; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $2, %xmm1, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %eax
-; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $3, %xmm1, %eax
-; X86XOP-NEXT:    shrdl $16, %eax, %ecx
-; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3,0,1]
-; X86XOP-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
-; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm2
-; X86XOP-NEXT:    vmovd %xmm2, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %eax
-; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $1, %xmm2, %eax
-; X86XOP-NEXT:    shrdl $16, %eax, %ecx
-; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $2, %xmm2, %ebp
-; X86XOP-NEXT:    shrdl $16, %ebp, %eax
-; X86XOP-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86XOP-NEXT:    vpextrd $3, %xmm2, %edi
-; X86XOP-NEXT:    shrdl $16, %edi, %ebp
-; X86XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86XOP-NEXT:    vpextrd $3, %xmm1, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 16(%eax)
+; X86XOP-NEXT:    vpextrd $2, %xmm1, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 12(%eax)
+; X86XOP-NEXT:    vpextrd $1, %xmm1, %esi
+; X86XOP-NEXT:    shldl $16, %esi, %edx
+; X86XOP-NEXT:    movl %edx, 8(%eax)
+; X86XOP-NEXT:    vmovd %xmm1, %edx
+; X86XOP-NEXT:    shldl $16, %edx, %esi
+; X86XOP-NEXT:    movl %esi, 4(%eax)
+; X86XOP-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
 ; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
-; X86XOP-NEXT:    vmovd %xmm0, %esi
-; X86XOP-NEXT:    shrdl $16, %esi, %edi
-; X86XOP-NEXT:    vpextrd $1, %xmm0, %edx
+; X86XOP-NEXT:    vpextrd $3, %xmm0, %esi
 ; X86XOP-NEXT:    shrdl $16, %edx, %esi
-; X86XOP-NEXT:    vpextrd $2, %xmm0, %ecx
-; X86XOP-NEXT:    shrdl $16, %ecx, %edx
-; X86XOP-NEXT:    vpextrd $3, %xmm0, %ebx
-; X86XOP-NEXT:    shrdl $16, %ebx, %ecx
-; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86XOP-NEXT:    movl %ecx, 60(%eax)
-; X86XOP-NEXT:    movl %edx, 56(%eax)
-; X86XOP-NEXT:    movl %esi, 52(%eax)
-; X86XOP-NEXT:    movl %edi, 48(%eax)
-; X86XOP-NEXT:    movl %ebp, 44(%eax)
-; X86XOP-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 40(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 36(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 32(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 28(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 24(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 20(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 16(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 12(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 8(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, 4(%eax)
-; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86XOP-NEXT:    movl %ecx, (%eax)
-; X86XOP-NEXT:    shrl $16, %ebx
-; X86XOP-NEXT:    movw %bx, 64(%eax)
-; X86XOP-NEXT:    addl $44, %esp
+; X86XOP-NEXT:    movl %esi, (%eax)
+; X86XOP-NEXT:    shrl $16, %ecx
+; X86XOP-NEXT:    movw %cx, 64(%eax)
 ; X86XOP-NEXT:    popl %esi
-; X86XOP-NEXT:    popl %edi
-; X86XOP-NEXT:    popl %ebx
-; X86XOP-NEXT:    popl %ebp
 ; X86XOP-NEXT:    vzeroupper
 ; X86XOP-NEXT:    retl $4
 ;
 ; X86GFNI-LABEL: large_promotion:
 ; X86GFNI:       # %bb.0:
-; X86GFNI-NEXT:    pushl %ebx
 ; X86GFNI-NEXT:    pushl %edi
 ; X86GFNI-NEXT:    pushl %esi
-; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86GFNI-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
 ; X86GFNI-NEXT:    vpermq {{[0-9]+}}(%esp), %zmm0, %zmm1
 ; X86GFNI-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
@@ -2441,56 +2316,60 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86GFNI-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
 ; X86GFNI-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm2, %zmm2
 ; X86GFNI-NEXT:    vextracti32x4 $3, %zmm2, %xmm3
-; X86GFNI-NEXT:    vmovd %xmm3, %ecx
-; X86GFNI-NEXT:    vpextrd $1, %xmm3, %edx
-; X86GFNI-NEXT:    shrdl $16, %edx, %ecx
+; X86GFNI-NEXT:    vpextrd $3, %xmm3, %ecx
+; X86GFNI-NEXT:    movl %ecx, %edx
+; X86GFNI-NEXT:    shrl $16, %edx
+; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86GFNI-NEXT:    movw %dx, 64(%eax)
+; X86GFNI-NEXT:    vmovd %xmm3, %edx
+; X86GFNI-NEXT:    vpextrd $1, %xmm3, %esi
+; X86GFNI-NEXT:    shrdl $16, %esi, %edx
 ; X86GFNI-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [23,24,23,26,23,28,23,30]
 ; X86GFNI-NEXT:    vpermw %zmm2, %zmm4, %zmm4
-; X86GFNI-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
-; X86GFNI-NEXT:    vpextrd $2, %xmm3, %esi
-; X86GFNI-NEXT:    shrdl $16, %esi, %edx
-; X86GFNI-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; X86GFNI-NEXT:    vpextrd $3, %xmm3, %ecx
-; X86GFNI-NEXT:    shrdl $16, %ecx, %esi
-; X86GFNI-NEXT:    vpinsrd $3, %esi, %xmm4, %xmm3
+; X86GFNI-NEXT:    vpinsrd $1, %edx, %xmm4, %xmm4
+; X86GFNI-NEXT:    vpextrd $2, %xmm3, %edx
+; X86GFNI-NEXT:    shrdl $16, %edx, %esi
+; X86GFNI-NEXT:    vpinsrd $2, %esi, %xmm4, %xmm3
+; X86GFNI-NEXT:    shldl $16, %edx, %ecx
+; X86GFNI-NEXT:    vpinsrd $3, %ecx, %xmm3, %xmm3
 ; X86GFNI-NEXT:    vextracti32x4 $2, %zmm2, %xmm4
-; X86GFNI-NEXT:    vmovd %xmm4, %edx
-; X86GFNI-NEXT:    vpextrd $1, %xmm4, %esi
-; X86GFNI-NEXT:    shrdl $16, %esi, %edx
+; X86GFNI-NEXT:    vmovd %xmm4, %ecx
+; X86GFNI-NEXT:    vpextrd $1, %xmm4, %edx
+; X86GFNI-NEXT:    shrdl $16, %edx, %ecx
 ; X86GFNI-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [15,16,15,18,15,20,15,22]
 ; X86GFNI-NEXT:    vpermw %zmm2, %zmm5, %zmm5
-; X86GFNI-NEXT:    vpinsrd $1, %edx, %xmm5, %xmm5
-; X86GFNI-NEXT:    vpextrd $2, %xmm4, %edx
-; X86GFNI-NEXT:    shrdl $16, %edx, %esi
-; X86GFNI-NEXT:    vpinsrd $2, %esi, %xmm5, %xmm5
-; X86GFNI-NEXT:    vpextrd $3, %xmm4, %esi
-; X86GFNI-NEXT:    shldl $16, %edx, %esi
-; X86GFNI-NEXT:    vpinsrd $3, %esi, %xmm5, %xmm4
+; X86GFNI-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86GFNI-NEXT:    vpextrd $2, %xmm4, %ecx
+; X86GFNI-NEXT:    shrdl $16, %ecx, %edx
+; X86GFNI-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X86GFNI-NEXT:    vpextrd $3, %xmm4, %edx
+; X86GFNI-NEXT:    shldl $16, %ecx, %edx
+; X86GFNI-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm4
 ; X86GFNI-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; X86GFNI-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; X86GFNI-NEXT:    vmovd %xmm4, %edx
-; X86GFNI-NEXT:    vpextrd $1, %xmm4, %esi
-; X86GFNI-NEXT:    shrdl $16, %esi, %edx
+; X86GFNI-NEXT:    vmovd %xmm4, %ecx
+; X86GFNI-NEXT:    vpextrd $1, %xmm4, %edx
+; X86GFNI-NEXT:    shrdl $16, %edx, %ecx
 ; X86GFNI-NEXT:    vpmovsxbw {{.*#+}} xmm5 = [7,8,7,10,7,12,7,14]
 ; X86GFNI-NEXT:    vpermw %ymm2, %ymm5, %ymm5
-; X86GFNI-NEXT:    vpinsrd $1, %edx, %xmm5, %xmm5
-; X86GFNI-NEXT:    vpextrd $2, %xmm4, %edx
-; X86GFNI-NEXT:    shrdl $16, %edx, %esi
-; X86GFNI-NEXT:    vpinsrd $2, %esi, %xmm5, %xmm5
-; X86GFNI-NEXT:    vpextrd $3, %xmm4, %esi
-; X86GFNI-NEXT:    shldl $16, %edx, %esi
-; X86GFNI-NEXT:    vpinsrd $3, %esi, %xmm5, %xmm4
-; X86GFNI-NEXT:    vpextrd $1, %xmm2, %edx
-; X86GFNI-NEXT:    vpextrd $2, %xmm2, %esi
-; X86GFNI-NEXT:    vpextrd $3, %xmm2, %edi
-; X86GFNI-NEXT:    shldl $16, %esi, %edi
+; X86GFNI-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
+; X86GFNI-NEXT:    vpextrd $2, %xmm4, %ecx
+; X86GFNI-NEXT:    shrdl $16, %ecx, %edx
+; X86GFNI-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
+; X86GFNI-NEXT:    vpextrd $3, %xmm4, %edx
+; X86GFNI-NEXT:    shldl $16, %ecx, %edx
+; X86GFNI-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm4
+; X86GFNI-NEXT:    vpextrd $1, %xmm2, %ecx
+; X86GFNI-NEXT:    vpextrd $2, %xmm2, %edx
+; X86GFNI-NEXT:    vpextrd $3, %xmm2, %esi
 ; X86GFNI-NEXT:    shldl $16, %edx, %esi
-; X86GFNI-NEXT:    vmovd %xmm2, %ebx
-; X86GFNI-NEXT:    shrdl $16, %edx, %ebx
+; X86GFNI-NEXT:    shldl $16, %ecx, %edx
+; X86GFNI-NEXT:    vmovd %xmm2, %edi
+; X86GFNI-NEXT:    shrdl $16, %ecx, %edi
 ; X86GFNI-NEXT:    vpslld $16, %xmm2, %xmm2
-; X86GFNI-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
-; X86GFNI-NEXT:    vpinsrd $2, %esi, %xmm2, %xmm2
-; X86GFNI-NEXT:    vpinsrd $3, %edi, %xmm2, %xmm2
+; X86GFNI-NEXT:    vpinsrd $1, %edi, %xmm2, %xmm2
+; X86GFNI-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
+; X86GFNI-NEXT:    vpinsrd $3, %esi, %xmm2, %xmm2
 ; X86GFNI-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
 ; X86GFNI-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; X86GFNI-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
@@ -2500,12 +2379,9 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86GFNI-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
 ; X86GFNI-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86GFNI-NEXT:    vpord %zmm2, %zmm0, %zmm0
-; X86GFNI-NEXT:    shrl $16, %ecx
-; X86GFNI-NEXT:    movw %cx, 64(%eax)
 ; X86GFNI-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86GFNI-NEXT:    popl %esi
 ; X86GFNI-NEXT:    popl %edi
-; X86GFNI-NEXT:    popl %ebx
 ; X86GFNI-NEXT:    vzeroupper
 ; X86GFNI-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 33381313d3c19..d4fed39e88034 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -148,37 +148,36 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    andl 56(%ebp), %ecx
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    movl 44(%ebp), %esi
-; X86-NEXT:    xorl %edi, %esi
-; X86-NEXT:    andl 60(%ebp), %esi
-; X86-NEXT:    xorl %edi, %esi
-; X86-NEXT:    movl 48(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    andl 64(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    movl 52(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    andl 68(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    andl 64(%ebp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    andl 60(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    andl 56(%ebp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 96ccc7b0f7527..347e4977fbcdb 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -44,9 +44,9 @@ define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
 ; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    btcl %eax, %esi
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    btl %eax, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -78,9 +78,9 @@ define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
 ; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    btrl %eax, %esi
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    btl %eax, %edx
 ; X86-NEXT:    setae %al
-; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -113,9 +113,9 @@ define i1 @set_ne_i32(ptr %word, i32 %position) nounwind {
 ; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    btsl %eax, %esi
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    btl %eax, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -143,17 +143,17 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%edx), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    btrl %ecx, %edi
 ; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx)
 ; X86-NEXT:    btl %ecx, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    movl %edi, (%edx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -204,12 +204,12 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: test_ne_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $32, %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    movl (%eax,%edx), %eax
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%ecx), %ecx
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -234,16 +234,17 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btcl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btcl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -274,16 +275,17 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btrl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    btrl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -315,16 +317,17 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btsl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btsl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -356,20 +359,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%edx,%esi), %edi
-; X86-NEXT:    btl %ecx, %edi
-; X86-NEXT:    setae %al
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $32, %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%eax), %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    btrl %ecx, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    movl %ebx, (%edx,%esi)
+; X86-NEXT:    movl %ebx, (%edx,%eax)
+; X86-NEXT:    btl %ecx, %esi
+; X86-NEXT:    setae %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -425,12 +428,12 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: test_ne_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $96, %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    movl (%eax,%edx), %eax
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%ecx), %ecx
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -457,16 +460,17 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btcl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btcl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -498,16 +502,17 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btrl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    btrl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -540,16 +545,17 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btsl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btsl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -582,20 +588,20 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%edx,%esi), %edi
-; X86-NEXT:    btl %ecx, %edi
-; X86-NEXT:    setae %al
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $96, %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%eax), %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    btrl %ecx, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    movl %ebx, (%edx,%esi)
+; X86-NEXT:    movl %ebx, (%edx,%eax)
+; X86-NEXT:    btl %ecx, %esi
+; X86-NEXT:    setae %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -649,12 +655,12 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: test_ne_i512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $60, %edx
-; X86-NEXT:    movl (%eax,%edx), %eax
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $60, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%ecx), %ecx
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -681,16 +687,17 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $60, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    andl $60, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btcl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btcl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -722,16 +729,17 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $60, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    andl $60, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btrl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    btrl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -764,16 +772,17 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $60, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    andl $60, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    btsl %eax, %edi
+; X86-NEXT:    movl %edi, (%edx,%ecx)
+; X86-NEXT:    btl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btsl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -806,20 +815,20 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    andl $60, %esi
-; X86-NEXT:    movl (%edx,%esi), %edi
-; X86-NEXT:    btl %ecx, %edi
-; X86-NEXT:    setae %al
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $60, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%eax), %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    btrl %ecx, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    movl %ebx, (%edx,%esi)
+; X86-NEXT:    movl %ebx, (%edx,%eax)
+; X86-NEXT:    btl %ecx, %esi
+; X86-NEXT:    setae %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -873,12 +882,12 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: test_ne_i4096:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $4064, %edx # imm = 0xFE0
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    movl (%eax,%edx), %eax
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $4064, %ecx # imm = 0xFE0
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%ecx), %ecx
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -906,18 +915,18 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: complement_cmpz_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    andl $96, %ecx
 ; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    xorl %edx, (%eax,%ecx)
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    orl 12(%eax), %edx
-; X86-NEXT:    orl 8(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, (%edx,%ecx)
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %ecx
+; X86-NEXT:    orl 12(%edx), %ecx
+; X86-NEXT:    orl 8(%edx), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
@@ -967,35 +976,32 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movzwl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzwl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzwl 14(%eax), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll $16, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    movzwl 2(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzwl 4(%eax), %edx
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movzwl 12(%ecx), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzwl 6(%eax), %esi
-; X86-NEXT:    movzwl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzwl 10(%eax), %eax
+; X86-NEXT:    movzwl 14(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movzwl 10(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movzwl 8(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movzwl 6(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movzwl 4(%ecx), %edi
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzwl 2(%ecx), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movzwl (%ecx), %edx
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1005,29 +1011,26 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    andb $96, %bl
 ; X86-NEXT:    shrb $3, %bl
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    movl 32(%esp,%edi), %edi
-; X86-NEXT:    btcl %eax, %edi
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    movl 32(%esp,%ebx), %ebx
+; X86-NEXT:    btcl %eax, %ebx
 ; X86-NEXT:    andl $96, %eax
 ; X86-NEXT:    shrl $3, %eax
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl %edi, (%ecx,%eax)
+; X86-NEXT:    movl %ebx, (%ecx,%eax)
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movw %dx, 14(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movw %dx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 8(%eax)
-; X86-NEXT:    movw %si, 6(%eax)
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movw %cx, 10(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movw %cx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movw %di, 4(%eax)
+; X86-NEXT:    movw %si, 2(%eax)
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1107,17 +1110,17 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $96, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi,%edx), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    btrl %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    btrl %edx, %ebx
-; X86-NEXT:    btl %edx, %edi
-; X86-NEXT:    movl %ebx, (%ecx,%esi)
+; X86-NEXT:    movl %ebx, (%esi,%edx)
+; X86-NEXT:    btl %ecx, %edi
 ; X86-NEXT:    jae .LBB23_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
@@ -1161,42 +1164,37 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
 define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
 ; X86-LABEL: chain_reset_i256:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-2, %edi
-; X86-NEXT:    roll %cl, %edi
+; X86-NEXT:    movl $-2, %edx
+; X86-NEXT:    roll %cl, %edx
 ; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    andl $28, %ecx
-; X86-NEXT:    andl %edi, (%esi,%ecx)
-; X86-NEXT:    movl 8(%esi), %ebx
-; X86-NEXT:    movl (%esi), %edi
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 12(%esi), %ebp
-; X86-NEXT:    orl 28(%esi), %ebp
-; X86-NEXT:    orl 20(%esi), %ecx
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    orl 24(%esi), %ebx
-; X86-NEXT:    movl 16(%esi), %ebp
-; X86-NEXT:    orl %edi, %ebp
-; X86-NEXT:    orl %ebx, %ebp
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl %edi, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %edx, (%eax,%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl (%edi), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movl %edx, (%edi)
+; X86-NEXT:    movl 12(%eax), %edi
+; X86-NEXT:    orl 28(%eax), %edi
+; X86-NEXT:    orl 20(%eax), %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl 8(%eax), %edi
+; X86-NEXT:    orl 24(%eax), %edi
+; X86-NEXT:    orl 16(%eax), %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    orl %ecx, %ebp
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    jne .LBB24_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:  .LBB24_2:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; SSE2-LABEL: chain_reset_i256:
@@ -1287,8 +1285,6 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $144, %esp
-; X86-NEXT:    movb 20(%ebp), %ch
-; X86-NEXT:    movb 12(%ebp), %cl
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1297,21 +1293,19 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 20(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 56(%esp,%eax), %edx
-; X86-NEXT:    movl 60(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax), %edi
-; X86-NEXT:    movl 52(%esp,%eax), %ebx
-; X86-NEXT:    shldl %cl, %ebx, %edx
+; X86-NEXT:    movsbl %al, %edi
+; X86-NEXT:    movl 84(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%edi), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %ebx
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl 80(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1320,62 +1314,73 @@ define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    movzbl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 84(%esp,%eax), %edx
-; X86-NEXT:    movl 88(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movsbl %al, %esi
+; X86-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-NEXT:    movl 56(%esp,%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    xorl 8(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl 48(%esp,%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl 92(%esp,%edi), %edx
 ; X86-NEXT:    movzbl 20(%ebp), %ecx
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%esp,%eax), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl 60(%esp,%esi), %eax
+; X86-NEXT:    movzbl 12(%ebp), %ecx
+; X86-NEXT:    movb 12(%ebp), %ch
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    xorl 12(%edi), %eax
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, 12(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    xorl 8(%eax), %edx
+; X86-NEXT:    movb 20(%ebp), %cl
+; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    xorl 12(%eax), %esi
-; X86-NEXT:    xorl (%eax), %edi
-; X86-NEXT:    xorl 4(%eax), %ebx
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    xorl (%edi), %esi
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, (%edi)
+; X86-NEXT:    movb 20(%ebp), %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl %cl, %edi, %ebx
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    xorl 4(%ecx), %ebx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl %edx, 4(%ecx)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 16(%ebp), %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    andb $96, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 96(%esp,%eax), %eax
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $96, %cl
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movl 96(%esp,%ecx), %ecx
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setae %al
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1517,128 +1522,132 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $240, %esp
-; X86-NEXT:    movl 8(%ebp), %ebx
-; X86-NEXT:    movl 12(%ebx), %esi
-; X86-NEXT:    movl 28(%ebx), %eax
-; X86-NEXT:    movl 60(%ebx), %ecx
+; X86-NEXT:    subl $224, %esp
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl 60(%esi), %ecx
+; X86-NEXT:    movl 28(%esi), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl 44(%ebx), %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl 44(%esi), %edx
+; X86-NEXT:    movl 12(%esi), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl 20(%ebx), %edx
-; X86-NEXT:    movl 52(%ebx), %eax
+; X86-NEXT:    movl 52(%esi), %eax
+; X86-NEXT:    movl 20(%esi), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl 4(%ebx), %edi
-; X86-NEXT:    movl 36(%ebx), %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 36(%esi), %edi
+; X86-NEXT:    movl 4(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl 24(%ebx), %edx
-; X86-NEXT:    movl 56(%ebx), %ecx
+; X86-NEXT:    movl 56(%esi), %ecx
+; X86-NEXT:    movl 24(%esi), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl 8(%ebx), %ecx
-; X86-NEXT:    movl 40(%ebx), %esi
+; X86-NEXT:    movl 40(%esi), %edi
+; X86-NEXT:    movl 8(%esi), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl 16(%ebx), %edx
-; X86-NEXT:    movl 48(%ebx), %esi
+; X86-NEXT:    movl 48(%esi), %edi
+; X86-NEXT:    movl 16(%esi), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl (%ebx), %esi
-; X86-NEXT:    movl 32(%ebx), %ebx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl (%esi), %ebx
+; X86-NEXT:    movl 32(%esi), %edi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB26_1
 ; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    jne .LBB26_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    rep bsfl %edi, %ecx
+; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    jmp .LBB26_5
 ; X86-NEXT:  .LBB26_1:
-; X86-NEXT:    movl $512, %ecx # imm = 0x200
-; X86-NEXT:    jmp .LBB26_41
+; X86-NEXT:    movl $512, %ebx # imm = 0x200
+; X86-NEXT:    jmp .LBB26_40
 ; X86-NEXT:  .LBB26_3:
-; X86-NEXT:    rep bsfl %ebx, %eax
-; X86-NEXT:  .LBB26_5: # %cond.false
+; X86-NEXT:    rep bsfl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:  .LBB26_5: # %cond.false
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB26_6
 ; X86-NEXT:  # %bb.7: # %cond.false
-; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    jmp .LBB26_8
+; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    je .LBB26_9
+; X86-NEXT:    jmp .LBB26_10
 ; X86-NEXT:  .LBB26_6:
-; X86-NEXT:    rep bsfl %ecx, %ecx
-; X86-NEXT:  .LBB26_8: # %cond.false
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    jne .LBB26_10
-; X86-NEXT:  # %bb.9: # %cond.false
-; X86-NEXT:    addl $64, %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:  .LBB26_9: # %cond.false
+; X86-NEXT:    addl $64, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:  .LBB26_10: # %cond.false
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jne .LBB26_11
 ; X86-NEXT:  # %bb.12: # %cond.false
-; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    rep bsfl %ebx, %eax
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    je .LBB26_15
 ; X86-NEXT:  .LBB26_14:
 ; X86-NEXT:    rep bsfl %edx, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    je .LBB26_17
 ; X86-NEXT:    jmp .LBB26_18
 ; X86-NEXT:  .LBB26_11:
-; X86-NEXT:    rep bsfl %esi, %ecx
+; X86-NEXT:    rep bsfl %esi, %eax
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB26_14
 ; X86-NEXT:  .LBB26_15: # %cond.false
 ; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    jne .LBB26_18
 ; X86-NEXT:  .LBB26_17: # %cond.false
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB26_18: # %cond.false
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    jne .LBB26_20
 ; X86-NEXT:  # %bb.19: # %cond.false
-; X86-NEXT:    subl $-128, %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    subl $-128, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:  .LBB26_20: # %cond.false
-; X86-NEXT:    addl $256, %eax # imm = 0x100
+; X86-NEXT:    addl $256, %ecx # imm = 0x100
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    jne .LBB26_21
 ; X86-NEXT:  # %bb.22: # %cond.false
@@ -1648,29 +1657,29 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:  .LBB26_21:
 ; X86-NEXT:    rep bsfl %edx, %ebx
 ; X86-NEXT:  .LBB26_23: # %cond.false
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB26_24
 ; X86-NEXT:  # %bb.25: # %cond.false
-; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    je .LBB26_27
 ; X86-NEXT:    jmp .LBB26_28
 ; X86-NEXT:  .LBB26_24:
-; X86-NEXT:    rep bsfl %ecx, %ecx
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    jne .LBB26_28
 ; X86-NEXT:  .LBB26_27: # %cond.false
-; X86-NEXT:    addl $64, %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    addl $64, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:  .LBB26_28: # %cond.false
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jne .LBB26_29
 ; X86-NEXT:  # %bb.30: # %cond.false
-; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    je .LBB26_33
 ; X86-NEXT:  .LBB26_32:
@@ -1679,7 +1688,7 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    je .LBB26_35
 ; X86-NEXT:    jmp .LBB26_36
 ; X86-NEXT:  .LBB26_29:
-; X86-NEXT:    rep bsfl %esi, %ecx
+; X86-NEXT:    rep bsfl %esi, %eax
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB26_32
 ; X86-NEXT:  .LBB26_33: # %cond.false
@@ -1689,7 +1698,7 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    jne .LBB26_36
 ; X86-NEXT:  .LBB26_35: # %cond.false
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB26_36: # %cond.false
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1698,34 +1707,31 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    jne .LBB26_38
 ; X86-NEXT:  # %bb.37: # %cond.false
-; X86-NEXT:    subl $-128, %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    subl $-128, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:  .LBB26_38: # %cond.false
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    jne .LBB26_40
 ; X86-NEXT:  # %bb.39: # %cond.false
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:  .LBB26_40: # %cond.false
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:  .LBB26_41: # %cond.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    andl $60, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    subl %esi, %edx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:  .LBB26_40: # %cond.end
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $60, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl %eax, %edi
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1758,132 +1764,105 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    movl 56(%edx), %edi
-; X86-NEXT:    movl 60(%edx), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%edx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    notl %edi
-; X86-NEXT:    andl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%edx), %eax
-; X86-NEXT:    movl 44(%edx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %eax, %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%edx), %esi
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%edx), %eax
-; X86-NEXT:    shldl %cl, %eax, %esi
-; X86-NEXT:    notl %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%edx), %esi
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%edx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl 20(%edi), %ebx
+; X86-NEXT:    movl 24(%edi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %ebx, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl 16(%edi), %edx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 12(%edi), %esi
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, 16(%eax)
+; X86-NEXT:    movl 8(%edi), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%edx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%edx), %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl 4(%edi), %edx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl (%edi), %esi
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    shldl %cl, %esi, %edx
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%ebx), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    notl %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl 28(%edi), %edx
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%ebx), %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %esi, 28(%ebx)
+; X86-NEXT:    movl 32(%edi), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 32(%eax)
+; X86-NEXT:    movl 36(%edi), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 36(%eax)
+; X86-NEXT:    movl 40(%edi), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 40(%eax)
+; X86-NEXT:    movl 44(%edi), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 44(%eax)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl 192(%esp,%eax), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%ebx), %edx
+; X86-NEXT:    movl %esi, 48(%ebx)
+; X86-NEXT:    movl 52(%edi), %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    notl %esi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edx, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    negl %eax
-; X86-NEXT:    movl 208(%esp,%eax), %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edx, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %esi, 52(%ebx)
+; X86-NEXT:    movl 56(%edi), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    notl %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 56(%ebx)
+; X86-NEXT:    movl 60(%edi), %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    notl %edx
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl (%ebx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edi, %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, 24(%ecx)
-; X86-NEXT:    movl %esi, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 16(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edi, 4(%ecx)
-; X86-NEXT:    movl %eax, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 32(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 36(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 40(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 44(%ecx)
-; X86-NEXT:    movl %edx, 48(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 52(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 56(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 60(%ecx)
+; X86-NEXT:    movl %edx, 60(%ebx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/blend-of-shift.ll b/llvm/test/CodeGen/X86/blend-of-shift.ll
index cf382c7903bd7..5698096bb571c 100644
--- a/llvm/test/CodeGen/X86/blend-of-shift.ll
+++ b/llvm/test/CodeGen/X86/blend-of-shift.ll
@@ -261,12 +261,19 @@ define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_shl_i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_shl_i16:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_shl_i16:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
   %i3 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -283,12 +290,19 @@ define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_lshr_i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_lshr_i16:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_lshr_i16:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
   %i3 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -305,12 +319,19 @@ define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_ashr_i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_ashr_i16:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_ashr_i16:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
   %i3 = bitcast <8 x i16> %i1 to <2 x i64>
@@ -328,12 +349,19 @@ define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_shl_i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_shl_i32:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_shl_i32:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
   %i3 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -350,12 +378,19 @@ define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_lshr_i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_lshr_i32:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_lshr_i32:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
   %i3 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -372,12 +407,19 @@ define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_ashr_i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_ashr_i32:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_ashr_i32:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
   %i3 = bitcast <4 x i32> %i1 to <2 x i64>
@@ -395,12 +437,19 @@ define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_shl_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_shl_i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_shl_i64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
   %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
   %i3 = bitcast <2 x i64> %i1 to <2 x i64>
@@ -417,12 +466,19 @@ define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
-; AVX2-LABEL: shuffle_i64_of_lshr_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
-; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X64-AVX2-LABEL: shuffle_i64_of_lshr_i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X86-AVX2-LABEL: shuffle_i64_of_lshr_i64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    retl
   %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
   %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
   %i3 = bitcast <2 x i64> %i1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/bmi-out-of-order.ll b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
index 3e23a5f348bde..e87c0cd926dc0 100644
--- a/llvm/test/CodeGen/X86/bmi-out-of-order.ll
+++ b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
@@ -35,9 +35,9 @@ define i64 @blsmask_through1(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -86,13 +86,13 @@ define i64 @blsmask_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -150,25 +150,25 @@ define i64 @blsmask_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    addl $-1, %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    adcl $-1, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %eax
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    imull %edi, %ebx
 ; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -220,15 +220,15 @@ define i64 @blsi_through1(i64 %a, i64 %b) nounwind {
 ; X86-LABEL: blsi_through1:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -271,17 +271,17 @@ define i64 @blsi_through3(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-LABEL: blsi_through3:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -333,31 +333,34 @@ entry:
 define i64 @blsi_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-LABEL: blsi_through1_used2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    negl %edx
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    andl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    andl %ebx, %esi
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    imull %edx, %ebx
-; X86-NEXT:    imull %eax, %edi
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    andl %ebx, %ebp
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    andl %edi, %ebp
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: blsi_through1_used2:
@@ -411,9 +414,9 @@ define i64 @blsr_through1(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -456,19 +459,19 @@ define i64 @blsr_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ; X86-LABEL: blsr_through3:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %esi, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -527,25 +530,25 @@ define i64 @blsr_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    addl $-1, %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    adcl $-1, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    andl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %edi, %eax
-; X86-NEXT:    andl %ebx, %esi
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %ebx, %ecx
+; X86-NEXT:    andl %ecx, %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    imull %edi, %ebx
 ; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    andl %esi, %edi
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index e5696ded4fbf1..f5fbaa28127e7 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -128,19 +128,17 @@ define i32 @and_neg_select_pos_i32(i1 %a0, i32 inreg %a1) nounwind {
 define i16 @and_select_neg_i16(i1 %a0, i16 %a1) nounwind {
 ; X86-LABEL: and_select_neg_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpb $1, %al
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    andl %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_neg_i16:
@@ -230,8 +228,8 @@ define i32 @and_select_neg_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    andl %edx, %eax
 ; X86-NEXT:    retl
@@ -258,10 +256,10 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    negl %edx
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    cmpb $1, %cl
 ; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    negl %edx
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    andl %esi, %eax
 ; X86-NEXT:    popl %esi
@@ -427,18 +425,16 @@ define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 define i16 @and_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpb $1, %al
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    leal -1(%edx), %esi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    leal -1(%edx), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    andl %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_sub_1_i16:
@@ -492,16 +488,14 @@ define <4 x i32> @and_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
 define i32 @and_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_no_sub_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -2(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    leal -2(%eax), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: and_select_no_sub_1:
@@ -526,8 +520,8 @@ define i32 @and_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_wrong_const:
 ; X86:       # %bb.0:
 ; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    andl %edx, %eax
 ; X86-NEXT:    retl
@@ -554,12 +548,12 @@ define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    decl %edx
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    cmpb $1, %cl
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    andl %esi, %eax
+; X86-NEXT:    leal -1(%edx), %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -809,16 +803,14 @@ define i32 @xor_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
 define i32 @xor_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: xor_select_sub_1_wrong_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    leal -1(%eax), %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    xorl %esi, %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    leal -1(%eax), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_select_sub_1_wrong_const:
diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll
index f37176d7c0a37..30e86da4c7571 100644
--- a/llvm/test/CodeGen/X86/bmi.ll
+++ b/llvm/test/CodeGen/X86/bmi.ll
@@ -237,7 +237,7 @@ define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -560,9 +560,9 @@ define i32 @bextr32b_load(ptr %x)  uwtable  ssp {
 ;
 ; X86-FAST-BEXTR-LABEL: bextr32b_load:
 ; X86-FAST-BEXTR:       # %bb.0:
-; X86-FAST-BEXTR-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-BEXTR-NEXT:    movl $3076, %ecx # imm = 0xC04
-; X86-FAST-BEXTR-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-FAST-BEXTR-NEXT:    movl $3076, %eax # imm = 0xC04
+; X86-FAST-BEXTR-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-BEXTR-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-FAST-BEXTR-NEXT:    retl
 ;
 ; X64-FAST-BEXTR-LABEL: bextr32b_load:
@@ -773,9 +773,9 @@ define i64 @blsi64(i64 %x)   {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
@@ -803,9 +803,9 @@ define i64 @blsi64_z(i64 %a, i64 %b) nounwind {
 ; X86-LABEL: blsi64_z:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
@@ -844,20 +844,20 @@ define i64 @blsi64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-LABEL: blsi64_z2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl %eax, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andl %ecx, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    cmovel %ecx, %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -886,21 +886,21 @@ define i64 @blsi64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-LABEL: blsi64_sle:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl %eax, %esi
-; X86-NEXT:    cmpl $1, %esi
-; X86-NEXT:    sbbl $0, %edx
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andl %ecx, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl $1, %esi
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    cmovll %ecx, %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -999,9 +999,9 @@ define i32 @blsmsk32_z(i32 %a, i32 %b) nounwind {
 define i32 @blsmsk32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X86-LABEL: blsmsk32_z2:
 ; X86:       # %bb.0:
-; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    blsmskl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovel %eax, %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    retl
@@ -1142,9 +1142,9 @@ define i64 @blsmsk64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    cmovel %eax, %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    movl 4(%ecx), %edx
@@ -1183,10 +1183,10 @@ define i64 @blsmsk64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    cmpl $1, %edx
-; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmovll %eax, %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    movl 4(%ecx), %edx
@@ -1432,9 +1432,9 @@ define i64 @blsr64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    andl %eax, %edx
 ; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    cmovel %eax, %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    movl 4(%ecx), %edx
@@ -1474,10 +1474,10 @@ define i64 @blsr64_sle(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    andl %ecx, %esi
 ; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    cmpl $1, %edx
-; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmovll %eax, %ecx
 ; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    movl 4(%ecx), %edx
@@ -1792,9 +1792,9 @@ define i64 @blsi64_branch(i64 %x) {
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    negl %edi
 ; X86-NEXT:    sbbl %ecx, %esi
@@ -1982,9 +1982,9 @@ define i64 @blsi_cflag_64(i64 %x, i64 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
@@ -2026,15 +2026,15 @@ define i64 @blsi64_not(i64 %x) nounwind {
 ; X86-LABEL: blsi64_not:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    andl %esi, %edx
-; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    notl %edx
+; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index cabeebb0c3f36..e024a9260b5bc 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -7,9 +7,9 @@ define i32 @bzhi32(i32 %x, i32 %y)   {
 ; X86-LABEL: bzhi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    bzhil %eax, %ecx, %eax
+; X86-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bzhi32:
@@ -82,9 +82,9 @@ define i32 @pdep32(i32 %x, i32 %y)   {
 ; X86-LABEL: pdep32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    pdepl %eax, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32:
@@ -128,9 +128,9 @@ define i32 @pdep32_load(i32 %x, ptr %y)   {
 define i32 @pdep32_anyext(i16 %x)   {
 ; X86-LABEL: pdep32_anyext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
-; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pdepl %eax, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_anyext:
@@ -153,9 +153,9 @@ define i32 @pdep32_anyext(i16 %x)   {
 define i32 @pdep32_demandedbits(i32 %x) {
 ; X86-LABEL: pdep32_demandedbits:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    movl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pdepl %eax, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_demandedbits:
@@ -256,9 +256,9 @@ define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
 define i32 @pdep32_knownbits(i32 %x) {
 ; X86-LABEL: pdep32_knownbits:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    pdepl %ecx, %eax, %eax
+; X86-NEXT:    movl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pdepl %eax, %ecx, %eax
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
@@ -317,9 +317,9 @@ define i32 @pext32(i32 %x, i32 %y)   {
 ; X86-LABEL: pext32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    pextl %ecx, %eax, %eax
+; X86-NEXT:    pextl %eax, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pext32:
@@ -363,9 +363,9 @@ define i32 @pext32_load(i32 %x, ptr %y)   {
 define i32 @pext32_knownbits(i32 %x)   {
 ; X86-LABEL: pext32_knownbits:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NEXT:    pextl %ecx, %eax, %eax
+; X86-NEXT:    movl $1431655765, %eax # imm = 0x55555555
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pextl %eax, %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pext32_knownbits:
@@ -390,13 +390,13 @@ declare i32 @llvm.x86.bmi.pext.32(i32, i32)
 define i32 @mulx32(i32 %x, i32 %y, ptr %p)   {
 ; X86-LABEL: mulx32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    mulxl %eax, %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    mulxl %eax, %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mulx32:
@@ -439,12 +439,12 @@ define i32 @mulx32(i32 %x, i32 %y, ptr %p)   {
 define i32 @mulx32_load(i32 %x, ptr %y, ptr %p)   {
 ; X86-LABEL: mulx32_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    mulxl (%eax), %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mulxl (%eax), %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mulx32_load:
diff --git a/llvm/test/CodeGen/X86/bool-vector.ll b/llvm/test/CodeGen/X86/bool-vector.ll
index d52b455eb2e6b..b816bf876f240 100644
--- a/llvm/test/CodeGen/X86/bool-vector.ll
+++ b/llvm/test/CodeGen/X86/bool-vector.ll
@@ -9,20 +9,20 @@
 define i32 @PR15215_bad(<4 x i32> %input) nounwind {
 ; X86-LABEL: PR15215_bad:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    shlb $3, %ah
-; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    shlb $2, %cl
-; X86-NEXT:    orb %ah, %cl
-; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    shlb $3, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andb $1, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    andb $3, %al
+; X86-NEXT:    shlb $2, %al
 ; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andb $1, %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    andb $3, %dl
+; X86-NEXT:    orb %al, %dl
+; X86-NEXT:    movzbl %dl, %eax
 ; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    retl
 ;
@@ -62,19 +62,17 @@ entry:
 define i32 @PR15215_good(<4 x i32> %input) nounwind {
 ; X86-LABEL: PR15215_good:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl $1, %esi
-; X86-NEXT:    andl $1, %edx
 ; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    leal (%esi,%edx,2), %edx
-; X86-NEXT:    leal (%edx,%ecx,4), %ecx
-; X86-NEXT:    leal (%ecx,%eax,8), %eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    leal (%eax,%ecx,8), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR15215_good:
diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 37917b4c8dd55..659dd9f887a13 100644
--- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -244,29 +244,29 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
 
 
 define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
-; AVX1-LABEL: f64xi8_i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f64xi8_i16:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f64xi8_i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
-; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f64xi8_i16:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f64xi8_i16:
 ; X86-AVX512F:       # %bb.0:
@@ -285,6 +285,30 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f64xi8_i16:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f64xi8_i16:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f64xi8_i16:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -301,29 +325,29 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
 
 
 define <64 x i8> @f64i8_i32(<64 x i8> %a) {
-; AVX1-LABEL: f64i8_i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f64i8_i32:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f64i8_i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f64i8_i32:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f64i8_i32:
 ; X86-AVX512F:       # %bb.0:
@@ -342,6 +366,30 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f64i8_i32:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f64i8_i32:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f64i8_i32:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -358,29 +406,29 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 
 
 define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
-; AVX1-LABEL: f64xi8_i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f64xi8_i64:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f64xi8_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f64xi8_i64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f64xi8_i64:
 ; X86-AVX512F:       # %bb.0:
@@ -399,6 +447,30 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f64xi8_i64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f64xi8_i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f64xi8_i64:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -415,31 +487,31 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 
 
 define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
-; AVX1-LABEL: f64xi8_i128:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f64xi8_i128:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f64xi8_i128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f64xi8_i128:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f64xi8_i128:
 ; X86-AVX512F:       # %bb.0:
@@ -460,6 +532,32 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f64xi8_i128:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X64-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f64xi8_i128:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X64-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f64xi8_i128:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -477,30 +575,30 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 
 
 define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
-; AVX1-LABEL: f64xi8_i256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f64xi8_i256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X86-AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X86-AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f64xi8_i256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f64xi8_i256:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512F-LABEL: f64xi8_i256:
 ; AVX512F:       # %bb.0:
@@ -520,6 +618,31 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f64xi8_i256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X64-AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X64-AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f64xi8_i256:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = add <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %a
   %res2 = and <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, %res1
   ret <64 x i8> %res2
@@ -697,29 +820,29 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
 
 
 define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
-; AVX1-LABEL: f32xi16_i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f32xi16_i32:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f32xi16_i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
-; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f32xi16_i32:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f32xi16_i32:
 ; X86-AVX512F:       # %bb.0:
@@ -738,6 +861,30 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f32xi16_i32:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f32xi16_i32:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3,0,3,0,3,0,3,0,3]
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f32xi16_i32:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -754,29 +901,29 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 
 
 define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
-; AVX1-LABEL: f32xi16_i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f32xi16_i64:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f32xi16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f32xi16_i64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f32xi16_i64:
 ; X86-AVX512F:       # %bb.0:
@@ -795,6 +942,30 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f32xi16_i64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f32xi16_i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f32xi16_i64:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -811,31 +982,31 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 
 
 define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
-; AVX1-LABEL: f32xi16_i128:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f32xi16_i128:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f32xi16_i128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f32xi16_i128:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512F-LABEL: f32xi16_i128:
 ; X86-AVX512F:       # %bb.0:
@@ -856,6 +1027,32 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
 ;
+; X64-AVX1-LABEL: f32xi16_i128:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X64-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f32xi16_i128:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512F-LABEL: f32xi16_i128:
 ; X64-AVX512F:       # %bb.0:
 ; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
@@ -873,30 +1070,30 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 
 
 define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
-; AVX1-LABEL: f32xi16_i256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f32xi16_i256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X86-AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f32xi16_i256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f32xi16_i256:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512F-LABEL: f32xi16_i256:
 ; AVX512F:       # %bb.0:
@@ -916,6 +1113,31 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f32xi16_i256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
+; X64-AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X64-AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f32xi16_i256:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = add <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
   %res2 = and <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %res1
   ret <32 x i16> %res2
@@ -1026,29 +1248,29 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
 
 
 define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
-; AVX1-LABEL: f16xi32_i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f16xi32_i64:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f16xi32_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
-; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f16xi32_i64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
+; X86-AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f16xi32_i64:
 ; AVX512:       # %bb.0:
@@ -1056,6 +1278,30 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f16xi32_i64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f16xi32_i64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,3,0,3,0,3,0,3]
+; X64-AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3>, %a
   %res2 = and <16 x i32> <i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3, i32 0, i32 3>, %res1
   ret <16 x i32> %res2
@@ -1063,31 +1309,31 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 
 
 define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
-; AVX1-LABEL: f16xi32_i128:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
-; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f16xi32_i128:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f16xi32_i128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f16xi32_i128:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f16xi32_i128:
 ; AVX512:       # %bb.0:
@@ -1096,6 +1342,32 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f16xi32_i128:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; X64-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f16xi32_i128:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; X64-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
   %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
   ret <16 x i32> %res2
@@ -1167,15 +1439,15 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,0,3,0,0,0,3,0]
 ; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
-; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
 ; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    retl
 ;
@@ -1183,9 +1455,9 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,3,0,0,0,3,0]
 ; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; X86-AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    retl
 ;
@@ -1239,28 +1511,28 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
 define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
 ; X86-AVX1-LABEL: f8xi64_i256:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; X86-AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [2,3]
 ; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,0,1,0,2,0,3,0]
-; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f8xi64_i256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3]
-; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f8xi64_i256:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3]
+; X86-AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: f8xi64_i256:
 ; X86-AVX512:       # %bb.0:
@@ -1286,6 +1558,15 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
 ; X64-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; X64-AVX1-NEXT:    retq
 ;
+; X64-AVX2-LABEL: f8xi64_i256:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,1,2,3]
+; X64-AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    retq
+;
 ; X64-AVX512-LABEL: f8xi64_i256:
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
@@ -1341,23 +1622,23 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 
 
 define <16 x float> @f16xf32_f64(<16 x float> %a) {
-; AVX1-LABEL: f16xf32_f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
-; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f16xf32_f64:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f16xf32_f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
-; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f16xf32_f64:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f16xf32_f64:
 ; AVX512:       # %bb.0:
@@ -1365,6 +1646,24 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f16xf32_f64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f16xf32_f64:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = fadd <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %a
   %res2 = fdiv <16 x float> <float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0, float 2.0, float 1.0>, %res1
   ret <16 x float> %res2
@@ -1372,25 +1671,25 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 
 
 define <16 x float> @f16xf32_f128(<16 x float> %a) {
-; AVX1-LABEL: f16xf32_f128:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
-; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f16xf32_f128:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f16xf32_f128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f16xf32_f128:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f16xf32_f128:
 ; AVX512:       # %bb.0:
@@ -1399,6 +1698,26 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f16xf32_f128:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X64-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f16xf32_f128:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X64-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = fadd <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %a
   %res2 = fdiv <16 x float> <float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0, float 4.0, float 1.0, float 2.0, float 3.0>, %res1
   ret <16 x float> %res2
@@ -1406,23 +1725,23 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 
 
 define <16 x float> @f16xf32_f256(<16 x float> %a) {
-; AVX1-LABEL: f16xf32_f256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
-; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f16xf32_f256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f16xf32_f256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
-; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f16xf32_f256:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f16xf32_f256:
 ; AVX512:       # %bb.0:
@@ -1431,6 +1750,24 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f16xf32_f256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f16xf32_f256:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vdivps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = fadd <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %a
   %res2 = fdiv <16 x float> <float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, %res1
   ret <16 x float> %res2
@@ -1452,25 +1789,25 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 
 
 define <8 x double> @f8xf64_f128(<8 x double> %a) {
-; AVX1-LABEL: f8xf64_f128:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
-; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f8xf64_f128:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X86-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f8xf64_f128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
-; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f8xf64_f128:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X86-AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f8xf64_f128:
 ; AVX512:       # %bb.0:
@@ -1479,6 +1816,26 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f8xf64_f128:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X64-AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f8xf64_f128:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
+; X64-AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; X64-AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = fadd <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %a
   %res2 = fdiv <8 x double> <double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0, double 2.0, double 1.0>, %res1
   ret <8 x double> %res2
@@ -1486,23 +1843,23 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 
 
 define <8 x double> @f8xf64_f256(<8 x double> %a) {
-; AVX1-LABEL: f8xf64_f256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
-; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: f8xf64_f256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X86-AVX1-NEXT:    retl
 ;
-; AVX2-LABEL: f8xf64_f256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
-; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: f8xf64_f256:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X86-AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    retl
 ;
 ; AVX512-LABEL: f8xf64_f256:
 ; AVX512:       # %bb.0:
@@ -1511,6 +1868,24 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: f8xf64_f256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X64-AVX1-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: f8xf64_f256:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
+; X64-AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vdivpd %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    retq
   %res1 = fadd <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %a
   %res2 = fdiv <8 x double> <double 4.0, double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0>, %res1
   ret <8 x double> %res2
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index f780f0172647f..e8061dbad1701 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -266,66 +266,83 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edx
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    orl 36(%ebp), %eax
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    rep bsfl %edi, %ebx
-; X86-NEXT:    addl $32, %ebx
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    rep bsfl %edx, %ecx
+; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    je .LBB8_7
 ; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    rep bsfl %ecx, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    jmp .LBB8_8
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl $128, %ebx
+; X86-NEXT:    movl $128, %ecx
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    rep bsfl %edx, %ebx
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    rep bsfl %ebx, %ecx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB8_6
 ; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    rep bsfl 36(%ebp), %esi
-; X86-NEXT:    addl $32, %esi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    rep bsfl %edi, %eax
+; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:  .LBB8_8: # %cond.false
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  # %bb.9: # %cond.false
-; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    addl $64, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl 36(%ebp), %edi
-; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    orl %ebx, %edx
 ; X86-NEXT:    je .LBB8_12
 ; X86-NEXT:  # %bb.13: # %cond.end
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    jmp .LBB8_14
-; X86-NEXT:  .LBB8_12:
-; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    jne .LBB8_16
+; X86-NEXT:  .LBB8_15:
 ; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    movl 40(%ebp), %ebx
-; X86-NEXT:  .LBB8_14: # %cond.end
-; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    je .LBB8_18
+; X86-NEXT:    jmp .LBB8_19
+; X86-NEXT:  .LBB8_12:
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    je .LBB8_15
+; X86-NEXT:  .LBB8_16: # %cond.end
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    jne .LBB8_19
+; X86-NEXT:  .LBB8_18:
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:  .LBB8_19: # %cond.end
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    jne .LBB8_21
+; X86-NEXT:  # %bb.20:
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:  .LBB8_21: # %cond.end
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -368,26 +385,26 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 36(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    je .LBB9_11
 ; X86-NEXT:  # %bb.1: # %select.true.sink
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_2
 ; X86-NEXT:  # %bb.3: # %select.true.sink
-; X86-NEXT:    rep bsfl %ecx, %ebx
-; X86-NEXT:    addl $32, %ebx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    rep bsfl %ecx, %edi
+; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    je .LBB9_6
 ; X86-NEXT:  .LBB9_5:
-; X86-NEXT:    rep bsfl %edi, %esi
+; X86-NEXT:    rep bsfl %ebx, %esi
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    je .LBB9_8
 ; X86-NEXT:    jmp .LBB9_9
@@ -402,8 +419,8 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    jmp .LBB9_10
 ; X86-NEXT:  .LBB9_2:
-; X86-NEXT:    rep bsfl %edx, %ebx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    rep bsfl %edx, %edi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    jne .LBB9_5
 ; X86-NEXT:  .LBB9_6: # %select.true.sink
 ; X86-NEXT:    rep bsfl %esi, %esi
@@ -412,9 +429,9 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    jne .LBB9_9
 ; X86-NEXT:  .LBB9_8: # %select.true.sink
 ; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:  .LBB9_9: # %select.true.sink
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index affacc5ee6487..c1a4c9f5d711c 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -294,75 +294,84 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %esi
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    je .LBB8_7
 ; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    bsrl %edi, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    bsrl %eax, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    je .LBB8_9
 ; X86-NEXT:    jmp .LBB8_10
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    movl $128, %ecx
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    bsrl %esi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB8_6
 ; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    bsrl %ecx, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    bsrl %esi, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  .LBB8_9: # %cond.false
-; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    je .LBB8_12
 ; X86-NEXT:  # %bb.13: # %cond.end
-; X86-NEXT:    xorl $127, %esi
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    jmp .LBB8_14
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    jne .LBB8_16
+; X86-NEXT:  .LBB8_15:
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    je .LBB8_18
+; X86-NEXT:    jmp .LBB8_19
 ; X86-NEXT:  .LBB8_12:
-; X86-NEXT:    movl 52(%ebp), %edi
-; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:  .LBB8_14: # %cond.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    je .LBB8_15
+; X86-NEXT:  .LBB8_16: # %cond.end
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    jne .LBB8_19
+; X86-NEXT:  .LBB8_18:
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:  .LBB8_19: # %cond.end
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    jne .LBB8_21
+; X86-NEXT:  # %bb.20:
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:  .LBB8_21: # %cond.end
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -404,66 +413,78 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
-; X86-NEXT:    bsrl %eax, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB9_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    bsrl %ebx, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    bsrl %esi, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    je .LBB9_7
 ; X86-NEXT:    jmp .LBB9_8
 ; X86-NEXT:  .LBB9_4:
-; X86-NEXT:    bsrl %edx, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    bsrl %edi, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    jne .LBB9_8
 ; X86-NEXT:  .LBB9_7:
-; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:  .LBB9_8:
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl 32(%ebp), %ebx
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    jne .LBB9_9
-; X86-NEXT:  # %bb.10:
-; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl 52(%ebp), %edi
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    jmp .LBB9_11
-; X86-NEXT:  .LBB9_9:
-; X86-NEXT:    xorl $127, %esi
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    orl 32(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    jne .LBB9_10
+; X86-NEXT:  # %bb.9:
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:  .LBB9_10:
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    je .LBB9_11
+; X86-NEXT:  # %bb.12:
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    je .LBB9_13
+; X86-NEXT:  .LBB9_14:
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    jne .LBB9_16
+; X86-NEXT:  .LBB9_15:
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:  .LBB9_16:
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    jne .LBB9_14
+; X86-NEXT:  .LBB9_13:
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    je .LBB9_15
+; X86-NEXT:    jmp .LBB9_16
 ;
 ; X64-LABEL: cmov_bsr128_undef:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll
index 673b7f16de75c..131511a8f23e1 100644
--- a/llvm/test/CodeGen/X86/bswap-wide-int.ll
+++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll
@@ -43,25 +43,22 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    bswapl %edi
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -69,21 +66,18 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-MOVBE:       # %bb.0:
 ; X86-MOVBE-NEXT:    pushl %ebp
 ; X86-MOVBE-NEXT:    movl %esp, %ebp
-; X86-MOVBE-NEXT:    pushl %edi
-; X86-MOVBE-NEXT:    pushl %esi
 ; X86-MOVBE-NEXT:    andl $-16, %esp
+; X86-MOVBE-NEXT:    subl $16, %esp
 ; X86-MOVBE-NEXT:    movl 8(%ebp), %eax
+; X86-MOVBE-NEXT:    movl 24(%ebp), %ecx
+; X86-MOVBE-NEXT:    movl 28(%ebp), %edx
+; X86-MOVBE-NEXT:    movbel %ecx, 12(%eax)
+; X86-MOVBE-NEXT:    movbel %edx, 8(%eax)
 ; X86-MOVBE-NEXT:    movl 32(%ebp), %ecx
 ; X86-MOVBE-NEXT:    movl 36(%ebp), %edx
-; X86-MOVBE-NEXT:    movl 24(%ebp), %esi
-; X86-MOVBE-NEXT:    movl 28(%ebp), %edi
-; X86-MOVBE-NEXT:    movbel %esi, 12(%eax)
-; X86-MOVBE-NEXT:    movbel %edi, 8(%eax)
 ; X86-MOVBE-NEXT:    movbel %ecx, 4(%eax)
 ; X86-MOVBE-NEXT:    movbel %edx, (%eax)
-; X86-MOVBE-NEXT:    leal -8(%ebp), %esp
-; X86-MOVBE-NEXT:    popl %esi
-; X86-MOVBE-NEXT:    popl %edi
+; X86-MOVBE-NEXT:    movl %ebp, %esp
 ; X86-MOVBE-NEXT:    popl %ebp
 ; X86-MOVBE-NEXT:    retl $4
 ;
@@ -109,52 +103,52 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 define i256 @bswap_i256(i256 %a0) nounwind {
 ; X86-LABEL: bswap_i256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, 24(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, 16(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X86-MOVBE-LABEL: bswap_i256:
 ; X86-MOVBE:       # %bb.0:
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-MOVBE-NEXT:    movbel %ecx, 28(%eax)
+; X86-MOVBE-NEXT:    movbel %edx, 24(%eax)
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movbel %ecx, 24(%eax)
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-MOVBE-NEXT:    movbel %ecx, 20(%eax)
+; X86-MOVBE-NEXT:    movbel %edx, 16(%eax)
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movbel %ecx, 16(%eax)
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-MOVBE-NEXT:    movbel %ecx, 12(%eax)
+; X86-MOVBE-NEXT:    movbel %edx, 8(%eax)
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movbel %ecx, 8(%eax)
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-MOVBE-NEXT:    movbel %ecx, 4(%eax)
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movbel %ecx, (%eax)
+; X86-MOVBE-NEXT:    movbel %edx, (%eax)
 ; X86-MOVBE-NEXT:    retl $4
 ;
 ; X64-LABEL: bswap_i256:
diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index 81eac5676bb5c..0a45bc43d436e 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -69,17 +69,15 @@ define i64 @Y(i64 %A) {
 define dso_local i32 @bswap_multiuse(i32 %x, i32 %y, ptr %p1, ptr %p2) nounwind {
 ; CHECK-LABEL: bswap_multiuse:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %ecx, (%eax)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    bswapl %esi
 ; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    movl %esi, (%edx)
-; CHECK-NEXT:    movl %eax, (%ecx)
-; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %eax, (%edx)
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: bswap_multiuse:
@@ -157,8 +155,8 @@ define i64 @not_bswap() {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzwl var16, %eax
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    shll $8, %eax
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    shrl $8, %eax
 ; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    retl
@@ -246,107 +244,80 @@ define i64 @finally_useful_bswap() {
 define i528 @large_promotion(i528 %A) nounwind {
 ; CHECK-LABEL: large_promotion:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $44, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %esi
 ; CHECK-NEXT:    bswapl %ecx
-; CHECK-NEXT:    shrdl $16, %ecx, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    shldl $16, %esi, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %edx, 60(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shldl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, 56(%eax)
 ; CHECK-NEXT:    bswapl %edx
-; CHECK-NEXT:    shrdl $16, %edx, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shldl $16, %edx, %edi
+; CHECK-NEXT:    movl %edi, 52(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shldl $16, %edi, %edx
+; CHECK-NEXT:    movl %edx, 48(%eax)
 ; CHECK-NEXT:    bswapl %esi
-; CHECK-NEXT:    shrdl $16, %esi, %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shldl $16, %esi, %edi
+; CHECK-NEXT:    movl %edi, 44(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    bswapl %edi
-; CHECK-NEXT:    shrdl $16, %edi, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    bswapl %ebx
-; CHECK-NEXT:    shrdl $16, %ebx, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    bswapl %ebp
-; CHECK-NEXT:    shrdl $16, %ebp, %ebx
-; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    shrdl $16, %eax, %ebp
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    bswapl %ecx
-; CHECK-NEXT:    shrdl $16, %ecx, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    shrdl $16, %eax, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    bswapl %ecx
-; CHECK-NEXT:    shrdl $16, %ecx, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    bswapl %ebp
-; CHECK-NEXT:    shrdl $16, %ebp, %ecx
-; CHECK-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    bswapl %ebx
-; CHECK-NEXT:    shrdl $16, %ebx, %ebp
+; CHECK-NEXT:    shldl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, 40(%eax)
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shldl $16, %edx, %edi
+; CHECK-NEXT:    movl %edi, 36(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shldl $16, %edi, %edx
+; CHECK-NEXT:    movl %edx, 32(%eax)
 ; CHECK-NEXT:    bswapl %esi
-; CHECK-NEXT:    shrdl $16, %esi, %ebx
+; CHECK-NEXT:    shldl $16, %esi, %edi
+; CHECK-NEXT:    movl %edi, 28(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shldl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, 24(%eax)
 ; CHECK-NEXT:    bswapl %edx
-; CHECK-NEXT:    shrdl $16, %edx, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    bswapl %ecx
-; CHECK-NEXT:    shrdl $16, %ecx, %edx
+; CHECK-NEXT:    shldl $16, %edx, %edi
+; CHECK-NEXT:    movl %edi, 20(%eax)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    bswapl %edi
-; CHECK-NEXT:    shrdl $16, %edi, %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %ecx, 60(%eax)
-; CHECK-NEXT:    movl %edx, 56(%eax)
-; CHECK-NEXT:    movl %esi, 52(%eax)
-; CHECK-NEXT:    movl %ebx, 48(%eax)
-; CHECK-NEXT:    movl %ebp, 44(%eax)
-; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 40(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 36(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 32(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 28(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 24(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 20(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 16(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 12(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    movl %ecx, (%eax)
-; CHECK-NEXT:    shrl $16, %edi
-; CHECK-NEXT:    movw %di, 64(%eax)
-; CHECK-NEXT:    addl $44, %esp
+; CHECK-NEXT:    shldl $16, %edi, %edx
+; CHECK-NEXT:    movl %edx, 16(%eax)
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shldl $16, %esi, %edi
+; CHECK-NEXT:    movl %edi, 12(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shldl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, 8(%eax)
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shldl $16, %edx, %edi
+; CHECK-NEXT:    movl %edi, 4(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shrdl $16, %edx, %esi
+; CHECK-NEXT:    movl %esi, (%eax)
+; CHECK-NEXT:    shrl $16, %ecx
+; CHECK-NEXT:    movw %cx, 64(%eax)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
-; CHECK-NEXT:    popl %ebx
-; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl $4
 ;
 ; CHECK64-LABEL: large_promotion:
@@ -396,8 +367,8 @@ define i32 @pr55484(i32 %0) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    shll $8, %eax
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    shrl $8, %eax
 ; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    cwtl
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll
index 98b51460207c3..09549dc3049c4 100644
--- a/llvm/test/CodeGen/X86/bswap_tree2.ll
+++ b/llvm/test/CodeGen/X86/bswap_tree2.ll
@@ -50,15 +50,15 @@
 define i32 @test2(i32 %x) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    shrl $8, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shll $8, %ecx
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $65280, %edx # imm = 0xFF00
 ; CHECK-NEXT:    andl $-16777216, %ecx # imm = 0xFF000000
-; CHECK-NEXT:    andl $16711680, %eax # imm = 0xFF0000
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    shrl $8, %eax
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    andl $16711680, %edx # imm = 0xFF0000
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    andl $65280, %eax # imm = 0xFF00
 ; CHECK-NEXT:    orl %edx, %eax
 ; CHECK-NEXT:    retl
 ;
@@ -101,16 +101,16 @@ define i32 @test3(float %x) nounwind {
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fistpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw (%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    shll $8, %edx
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    shrl $8, %eax
-; CHECK-NEXT:    andl $65280, %ecx # imm = 0xFF00
-; CHECK-NEXT:    andl $-16777216, %edx # imm = 0xFF000000
-; CHECK-NEXT:    andl $16711680, %eax # imm = 0xFF0000
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    andl $-16777216, %ecx # imm = 0xFF000000
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    shrl $8, %edx
+; CHECK-NEXT:    andl $16711680, %edx # imm = 0xFF0000
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    andl $65280, %eax # imm = 0xFF00
 ; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    addl $8, %esp
 ; CHECK-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll
index 3a792abda1230..d2d7fd34f56eb 100644
--- a/llvm/test/CodeGen/X86/bt.ll
+++ b/llvm/test/CodeGen/X86/bt.ll
@@ -26,7 +26,7 @@ define void @test2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB0_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -62,7 +62,7 @@ define void @test2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB1_1
 ; X86-NEXT:  # %bb.2: # %UnifiedReturnBlock
 ; X86-NEXT:    retl
@@ -100,7 +100,7 @@ define void @atest2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB2_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -136,7 +136,7 @@ define void @atest2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB3_1
 ; X86-NEXT:  # %bb.2: # %UnifiedReturnBlock
 ; X86-NEXT:    retl
@@ -174,7 +174,7 @@ define void @test3(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB4_1
 ; X86-NEXT:  # %bb.2: # %UnifiedReturnBlock
 ; X86-NEXT:    retl
@@ -212,7 +212,7 @@ define void @test3b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB5_1
 ; X86-NEXT:  # %bb.2: # %UnifiedReturnBlock
 ; X86-NEXT:    retl
@@ -250,7 +250,7 @@ define void @testne2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB6_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -286,7 +286,7 @@ define void @testne2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB7_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -322,7 +322,7 @@ define void @atestne2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB8_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -358,7 +358,7 @@ define void @atestne2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB9_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -394,7 +394,7 @@ define void @testne3(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB10_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -430,7 +430,7 @@ define void @testne3b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB11_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -466,7 +466,7 @@ define void @query2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB12_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -502,7 +502,7 @@ define void @query2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB13_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -538,7 +538,7 @@ define void @aquery2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB14_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -574,7 +574,7 @@ define void @aquery2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB15_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -610,7 +610,7 @@ define void @query3(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB16_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -646,7 +646,7 @@ define void @query3b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB17_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -682,7 +682,7 @@ define void @query3x(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB18_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -718,7 +718,7 @@ define void @query3bx(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jae .LBB19_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -754,7 +754,7 @@ define void @queryne2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB20_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -790,7 +790,7 @@ define void @queryne2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB21_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -826,7 +826,7 @@ define void @aqueryne2(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB22_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -862,7 +862,7 @@ define void @aqueryne2b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB23_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -898,7 +898,7 @@ define void @queryne3(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB24_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -934,7 +934,7 @@ define void @queryne3b(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB25_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -970,7 +970,7 @@ define void @queryne3x(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB26_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -1006,7 +1006,7 @@ define void @queryne3bx(i32 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    jb .LBB27_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    calll foo@PLT
@@ -1044,7 +1044,7 @@ define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -1065,7 +1065,7 @@ define zeroext i1 @extend(i32 %bit, i64 %bits) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -1092,18 +1092,18 @@ define void @demanded_i32(ptr nocapture readonly, ptr nocapture, i32) nounwind {
 ; X86-LABEL: demanded_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrl $5, %eax
-; X86-NEXT:    movl (%edx,%eax,4), %esi
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl $5, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi,%edx,4), %esi
 ; X86-NEXT:    btl %ecx, %esi
 ; X86-NEXT:    jae .LBB30_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl %edx, (%ecx,%eax,4)
+; X86-NEXT:    orl %eax, (%ecx,%edx,4)
 ; X86-NEXT:  .LBB30_2:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -1150,8 +1150,8 @@ define zeroext i1 @demanded_with_known_zeroes(i32 %bit, i32 %bits) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $2, %al
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll
index efd9d1105d975..e3e118adba2c1 100644
--- a/llvm/test/CodeGen/X86/btc_bts_btr.ll
+++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll
@@ -16,8 +16,8 @@ define i16 @btr_16(i16 %x, i16 %n) {
 ;
 ; X86-LABEL: btr_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrw %cx, %ax
 ; X86-NEXT:    retl
   %1 = shl i16 1, %n
@@ -77,8 +77,8 @@ define i32 @btr_32(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: btr_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 1, %n
@@ -96,8 +96,8 @@ define i32 @bts_32(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: bts_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btsl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 1, %n
@@ -114,8 +114,8 @@ define i32 @btc_32(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: btc_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btcl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = shl i32 1, %n
@@ -132,21 +132,36 @@ define i64 @btr_64(i64 %x, i64 %n) {
 ;
 ; X86-LABEL: btr_64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB6_2
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    jne .LBB6_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:  .LBB6_2:
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    notl %eax
+; X86-NEXT:    jne .LBB6_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB6_4:
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = shl i64 1, %n
   %2 = xor i64 %1, -1
@@ -223,8 +238,8 @@ define i16 @btr_16_mask(i16 %x, i16 %n) {
 ;
 ; X86-LABEL: btr_16_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrw %cx, %ax
 ; X86-NEXT:    retl
   %1 = and i16 %n, 15
@@ -291,8 +306,8 @@ define i32 @btr_32_mask(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: btr_32_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btrl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = and i32 %n, 31
@@ -311,8 +326,8 @@ define i32 @bts_32_mask(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: bts_32_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btsl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = and i32 %n, 31
@@ -330,8 +345,8 @@ define i32 @btc_32_mask(i32 %x, i32 %n) {
 ;
 ; X86-LABEL: btc_32_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    btcl %ecx, %eax
 ; X86-NEXT:    retl
   %1 = and i32 %n, 31
@@ -349,21 +364,36 @@ define i64 @btr_64_mask(i64 %x, i64 %n) {
 ;
 ; X86-LABEL: btr_64_mask:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB15_2
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    jne .LBB15_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:  .LBB15_2:
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    notl %eax
+; X86-NEXT:    jne .LBB15_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB15_4:
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = and i64 %n, 63
   %2 = shl i64 1, %1
@@ -466,11 +496,11 @@ define i16 @bts_16_load(ptr %x, i16 %n) {
 ;
 ; X86-LABEL: bts_16_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    orw (%edx), %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orw (%ecx), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %1 = load i16, ptr %x
@@ -492,11 +522,11 @@ define i16 @btc_16_load(ptr %x, i16 %n) {
 ;
 ; X86-LABEL: btc_16_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    xorw (%edx), %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorw (%ecx), %ax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %1 = load i16, ptr %x
@@ -575,26 +605,36 @@ define i64 @btr_64_load(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: btr_64_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB24_2
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    jne .LBB24_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:  .LBB24_2:
 ; X86-NEXT:    notl %edx
+; X86-NEXT:    jne .LBB24_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB24_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl 4(%ecx), %edx
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    andl 4(%esi), %edx
-; X86-NEXT:    andl (%esi), %eax
+; X86-NEXT:    andl (%ecx), %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
@@ -613,10 +653,6 @@ define i64 @bts_64_load(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: bts_64_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
@@ -628,10 +664,9 @@ define i64 @bts_64_load(ptr %x, i64 %n) {
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB25_2:
-; X86-NEXT:    orl 4(%esi), %edx
-; X86-NEXT:    orl (%esi), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl 4(%ecx), %edx
+; X86-NEXT:    orl (%ecx), %eax
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
   %2 = shl i64 1, %n
@@ -648,10 +683,6 @@ define i64 @btc_64_load(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: btc_64_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    xorl %edx, %edx
@@ -663,10 +694,9 @@ define i64 @btc_64_load(ptr %x, i64 %n) {
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB26_2:
-; X86-NEXT:    xorl 4(%esi), %edx
-; X86-NEXT:    xorl (%esi), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl 4(%ecx), %edx
+; X86-NEXT:    xorl (%ecx), %eax
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
   %2 = shl i64 1, %n
@@ -690,11 +720,11 @@ define void @btr_16_dont_fold(ptr %x, i16 %n) {
 ;
 ; X86-LABEL: btr_16_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw $-2, %dx
-; X86-NEXT:    rolw %cl, %dx
-; X86-NEXT:    andw %dx, (%eax)
+; X86-NEXT:    movw $-2, %ax
+; X86-NEXT:    rolw %cl, %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andw %ax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i16, ptr %x
   %2 = shl i16 1, %n
@@ -716,11 +746,11 @@ define void @bts_16_dont_fold(ptr %x, i16 %n) {
 ;
 ; X86-LABEL: bts_16_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    orw %dx, (%eax)
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orw %ax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i16, ptr %x
   %2 = shl i16 1, %n
@@ -741,11 +771,11 @@ define void @btc_16_dont_fold(ptr %x, i16 %n) {
 ;
 ; X86-LABEL: btc_16_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    xorw %dx, (%eax)
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorw %ax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i16, ptr %x
   %2 = shl i16 1, %n
@@ -766,11 +796,11 @@ define void @btr_32_dont_fold(ptr %x, i32 %n) {
 ;
 ; X86-LABEL: btr_32_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-2, %edx
-; X86-NEXT:    roll %cl, %edx
-; X86-NEXT:    andl %edx, (%eax)
+; X86-NEXT:    movl $-2, %eax
+; X86-NEXT:    roll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i32, ptr %x
   %2 = shl i32 1, %n
@@ -792,11 +822,11 @@ define void @bts_32_dont_fold(ptr %x, i32 %n) {
 ;
 ; X86-LABEL: bts_32_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    orl %edx, (%eax)
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i32, ptr %x
   %2 = shl i32 1, %n
@@ -817,11 +847,11 @@ define void @btc_32_dont_fold(ptr %x, i32 %n) {
 ;
 ; X86-LABEL: btc_32_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    xorl %edx, (%eax)
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, (%ecx)
 ; X86-NEXT:    retl
   %1 = load i32, ptr %x
   %2 = shl i32 1, %n
@@ -842,27 +872,22 @@ define void @btr_64_dont_fold(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: btr_64_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB33_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB33_2:
-; X86-NEXT:    notl %esi
+; X86-NEXT:    notl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, (%ecx)
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    andl %edx, (%eax)
-; X86-NEXT:    andl %esi, 4(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    andl %edx, 4(%ecx)
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
   %2 = shl i64 1, %n
@@ -884,25 +909,20 @@ define void @bts_64_dont_fold(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: bts_64_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB34_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB34_2:
-; X86-NEXT:    orl %edx, (%eax)
-; X86-NEXT:    orl %esi, 4(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, (%ecx)
+; X86-NEXT:    orl %edx, 4(%ecx)
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
   %2 = shl i64 1, %n
@@ -923,25 +943,20 @@ define void @btc_64_dont_fold(ptr %x, i64 %n) {
 ;
 ; X86-LABEL: btc_64_dont_fold:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB35_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB35_2:
-; X86-NEXT:    xorl %edx, (%eax)
-; X86-NEXT:    xorl %esi, 4(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, (%ecx)
+; X86-NEXT:    xorl %edx, 4(%ecx)
 ; X86-NEXT:    retl
   %1 = load i64, ptr %x
   %2 = shl i64 1, %n
@@ -1027,22 +1042,37 @@ define i64 @btr_64_mask_zeros(i64 %x, i64 %n) {
 ;
 ; X86-LABEL: btr_64_mask_zeros:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $2, %ecx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB39_2
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    jne .LBB39_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:  .LBB39_2:
 ; X86-NEXT:    notl %edx
-; X86-NEXT:    notl %eax
+; X86-NEXT:    jne .LBB39_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:  .LBB39_4:
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %1 = shl i64 %n, 2
   %2 = and i64 %1, 63
diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index 51d3a65be5236..4b68f520229d2 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -704,8 +704,8 @@ define <4 x float> @test_buildvector_4f32_2_load(ptr %p0, ptr %p1) {
 ; SSE2-32-LABEL: test_buildvector_4f32_2_load:
 ; SSE2-32:       # %bb.0:
 ; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE2-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE2-32-NEXT:    movaps %xmm1, %xmm2
 ; SSE2-32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
@@ -726,8 +726,8 @@ define <4 x float> @test_buildvector_4f32_2_load(ptr %p0, ptr %p1) {
 ; SSE41-32-LABEL: test_buildvector_4f32_2_load:
 ; SSE41-32:       # %bb.0:
 ; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE41-32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
@@ -746,8 +746,8 @@ define <4 x float> @test_buildvector_4f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-LABEL: test_buildvector_4f32_2_load:
 ; AVX-32:       # %bb.0:
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vbroadcastss (%eax), %xmm1
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -860,9 +860,9 @@ define <8 x i16> @test_buildvector_8i16_2_load(ptr %p0, ptr %p1) {
 ; SSE2-32-LABEL: test_buildvector_8i16_2_load:
 ; SSE2-32:       # %bb.0:
 ; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-32-NEXT:    movzwl (%ecx), %ecx
-; SSE2-32-NEXT:    movd %ecx, %xmm1
+; SSE2-32-NEXT:    movzwl (%eax), %eax
+; SSE2-32-NEXT:    movd %eax, %xmm1
+; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-32-NEXT:    movzwl (%eax), %eax
 ; SSE2-32-NEXT:    movd %eax, %xmm0
 ; SSE2-32-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
@@ -889,9 +889,9 @@ define <8 x i16> @test_buildvector_8i16_2_load(ptr %p0, ptr %p1) {
 ;
 ; SSE41-32-LABEL: test_buildvector_8i16_2_load:
 ; SSE41-32:       # %bb.0:
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE41-32-NEXT:    movzwl (%eax), %eax
+; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE41-32-NEXT:    movzwl (%ecx), %ecx
 ; SSE41-32-NEXT:    movd %ecx, %xmm0
 ; SSE41-32-NEXT:    pinsrw $1, %eax, %xmm0
@@ -920,17 +920,17 @@ define <8 x i16> @test_buildvector_8i16_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-LABEL: test_buildvector_8i16_2_load:
 ; AVX-32:       # %bb.0:
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-32-NEXT:    movzwl (%eax), %eax
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    movzwl (%ecx), %ecx
-; AVX-32-NEXT:    movzwl (%eax), %eax
-; AVX-32-NEXT:    vmovd %eax, %xmm0
-; AVX-32-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX-32-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vmovd %ecx, %xmm0
+; AVX-32-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX-32-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; AVX-32-NEXT:    retl
 ;
 ; AVX-64-LABEL: test_buildvector_8i16_2_load:
@@ -1108,9 +1108,9 @@ define <16 x i8> @test_buildvector_16i8_2_load(ptr %p0, ptr %p1) {
 ; SSE2-32-LABEL: test_buildvector_16i8_2_load:
 ; SSE2-32:       # %bb.0:
 ; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-32-NEXT:    movzbl (%ecx), %ecx
-; SSE2-32-NEXT:    movd %ecx, %xmm0
+; SSE2-32-NEXT:    movzbl (%eax), %eax
+; SSE2-32-NEXT:    movd %eax, %xmm0
+; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-32-NEXT:    movzbl (%eax), %eax
 ; SSE2-32-NEXT:    movd %eax, %xmm2
 ; SSE2-32-NEXT:    movdqa %xmm2, %xmm1
@@ -1156,8 +1156,8 @@ define <16 x i8> @test_buildvector_16i8_2_load(ptr %p0, ptr %p1) {
 ; SSE41-32-LABEL: test_buildvector_16i8_2_load:
 ; SSE41-32:       # %bb.0:
 ; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE41-32-NEXT:    movzbl (%ecx), %ecx
+; SSE41-32-NEXT:    movzbl (%eax), %ecx
+; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE41-32-NEXT:    movzbl (%eax), %eax
 ; SSE41-32-NEXT:    movd %eax, %xmm0
 ; SSE41-32-NEXT:    pinsrb $1, %ecx, %xmm0
@@ -1202,8 +1202,8 @@ define <16 x i8> @test_buildvector_16i8_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-LABEL: test_buildvector_16i8_2_load:
 ; AVX-32:       # %bb.0:
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    movzbl (%ecx), %ecx
+; AVX-32-NEXT:    movzbl (%eax), %ecx
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    movzbl (%eax), %eax
 ; AVX-32-NEXT:    vmovd %eax, %xmm0
 ; AVX-32-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
@@ -1410,7 +1410,6 @@ define <4 x float> @PR37502(float %x, float %y) {
 define void @pr60168_buildvector_of_zeros_and_undef(<2 x i32> %x, ptr %out) {
 ; SSE2-32-LABEL: pr60168_buildvector_of_zeros_and_undef:
 ; SSE2-32:       # %bb.0:
-; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-32-NEXT:    paddd %xmm0, %xmm0
 ; SSE2-32-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-32-NEXT:    psubd %xmm0, %xmm1
@@ -1418,6 +1417,7 @@ define void @pr60168_buildvector_of_zeros_and_undef(<2 x i32> %x, ptr %out) {
 ; SSE2-32-NEXT:    psrad $31, %xmm0
 ; SSE2-32-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-32-NEXT:    psubd %xmm0, %xmm1
+; SSE2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-32-NEXT:    movq %xmm1, (%eax)
 ; SSE2-32-NEXT:    retl
 ;
@@ -1435,11 +1435,11 @@ define void @pr60168_buildvector_of_zeros_and_undef(<2 x i32> %x, ptr %out) {
 ;
 ; SSE41-32-LABEL: pr60168_buildvector_of_zeros_and_undef:
 ; SSE41-32:       # %bb.0:
-; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE41-32-NEXT:    paddd %xmm0, %xmm0
 ; SSE41-32-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-32-NEXT:    psubd %xmm0, %xmm1
 ; SSE41-32-NEXT:    pabsd %xmm1, %xmm0
+; SSE41-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE41-32-NEXT:    movq %xmm0, (%eax)
 ; SSE41-32-NEXT:    retl
 ;
@@ -1454,11 +1454,11 @@ define void @pr60168_buildvector_of_zeros_and_undef(<2 x i32> %x, ptr %out) {
 ;
 ; AVX-32-LABEL: pr60168_buildvector_of_zeros_and_undef:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
 ; AVX-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX-32-NEXT:    vpabsd %xmm0, %xmm0
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vmovq %xmm0, (%eax)
 ; AVX-32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 773eb8f6742e5..54485516b0515 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -453,8 +453,8 @@ define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
 ; AVX1-32-LABEL: test_buildvector_4f64_2_load:
 ; AVX1-32:       # %bb.0:
 ; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX1-32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-32-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
 ; AVX1-32-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -473,8 +473,8 @@ define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
 ; AVX2-32-LABEL: test_buildvector_4f64_2_load:
 ; AVX2-32:       # %bb.0:
 ; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX2-32-NEXT:    vbroadcastsd (%ecx), %ymm0
+; AVX2-32-NEXT:    vbroadcastsd (%eax), %ymm0
+; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX2-32-NEXT:    vbroadcastsd (%eax), %ymm1
 ; AVX2-32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
 ; AVX2-32-NEXT:    retl
@@ -535,8 +535,8 @@ define <8 x float> @test_buildvector_8f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-LABEL: test_buildvector_8f32_2_load:
 ; AVX-32:       # %bb.0:
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vbroadcastss (%eax), %xmm1
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm1[0,1,2],xmm0[0]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -674,9 +674,9 @@ define <16 x i16> @test_buildvector_16i16_2_var(i16 %a0, i16 %a1) {
 define <16 x i16> @test_buildvector_16i16_2_load(ptr %p0, ptr %p1) {
 ; AVX1-32-LABEL: test_buildvector_16i16_2_load:
 ; AVX1-32:       # %bb.0:
-; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX1-32-NEXT:    movzwl (%eax), %eax
+; AVX1-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX1-32-NEXT:    movzwl (%ecx), %ecx
 ; AVX1-32-NEXT:    vmovd %ecx, %xmm0
 ; AVX1-32-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
@@ -722,9 +722,9 @@ define <16 x i16> @test_buildvector_16i16_2_load(ptr %p0, ptr %p1) {
 ;
 ; AVX2-32-LABEL: test_buildvector_16i16_2_load:
 ; AVX2-32:       # %bb.0:
-; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX2-32-NEXT:    movzwl (%eax), %eax
+; AVX2-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX2-32-NEXT:    movzwl (%ecx), %ecx
 ; AVX2-32-NEXT:    vmovd %ecx, %xmm0
 ; AVX2-32-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index ff54288cfa5e0..4ce6b67de6399 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -482,9 +482,9 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
 ; AVX512F-32-LABEL: test_buildvector_8f64_2_var:
 ; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
 ; AVX512F-32-NEXT:    movb $-126, %al
 ; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
 ; AVX512F-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
 ; AVX512F-32-NEXT:    retl
 ;
@@ -498,9 +498,9 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
 ;
 ; AVX512BW-32-LABEL: test_buildvector_8f64_2_var:
 ; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
 ; AVX512BW-32-NEXT:    movb $-126, %al
 ; AVX512BW-32-NEXT:    kmovd %eax, %k1
+; AVX512BW-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
 ; AVX512BW-32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
 ; AVX512BW-32-NEXT:    retl
 ;
@@ -525,11 +525,11 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
 define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
 ; AVX512F-32-LABEL: test_buildvector_8f64_2_load:
 ; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movb $-126, %al
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vbroadcastsd (%eax), %zmm0
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    vbroadcastsd (%ecx), %zmm0
-; AVX512F-32-NEXT:    movb $-126, %cl
-; AVX512F-32-NEXT:    kmovw %ecx, %k1
 ; AVX512F-32-NEXT:    vbroadcastsd (%eax), %zmm0 {%k1}
 ; AVX512F-32-NEXT:    retl
 ;
@@ -543,11 +543,11 @@ define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
 ;
 ; AVX512BW-32-LABEL: test_buildvector_8f64_2_load:
 ; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    movb $-126, %al
+; AVX512BW-32-NEXT:    kmovd %eax, %k1
+; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512BW-32-NEXT:    vbroadcastsd (%eax), %zmm0
 ; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512BW-32-NEXT:    vbroadcastsd (%ecx), %zmm0
-; AVX512BW-32-NEXT:    movb $-126, %cl
-; AVX512BW-32-NEXT:    kmovd %ecx, %k1
 ; AVX512BW-32-NEXT:    vbroadcastsd (%eax), %zmm0 {%k1}
 ; AVX512BW-32-NEXT:    retl
 ;
@@ -622,10 +622,10 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
 define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-LABEL: test_buildvector_16f32_2_load:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,17,0,0]
-; AVX-32-NEXT:    vbroadcastss (%ecx), %xmm1
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-32-NEXT:    vbroadcastss (%eax), %xmm1
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
diff --git a/llvm/test/CodeGen/X86/build_fp16_constant_vector.ll b/llvm/test/CodeGen/X86/build_fp16_constant_vector.ll
index 6cb449822145a..b7930837699f7 100644
--- a/llvm/test/CodeGen/X86/build_fp16_constant_vector.ll
+++ b/llvm/test/CodeGen/X86/build_fp16_constant_vector.ll
@@ -3,14 +3,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s
 
 define dso_local <32 x half> @foo(<32 x half> %a, <32 x half> %b, <32 x half> %c) {
-; CHECK-LABEL: foo:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
-; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm3, %zmm0
-; CHECK-NEXT:    vfmadd213ph %zmm2, %zmm3, %zmm1
-; CHECK-NEXT:    vaddph %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = tail call fast <32 x half> @llvm.fma.v32f16(<32 x half> %a, <32 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00>, <32 x half> %c)
   %1 = tail call fast <32 x half> @llvm.fma.v32f16(<32 x half> %b, <32 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00, half 0xHBC00>, <32 x half> %c)
@@ -19,3 +11,5 @@ entry:
 }
 
 declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/byref.ll b/llvm/test/CodeGen/X86/byref.ll
index 67fa130ef9d73..9604e0cf66935 100644
--- a/llvm/test/CodeGen/X86/byref.ll
+++ b/llvm/test/CodeGen/X86/byref.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
 
 %Foo = type { i32, i32 }
@@ -8,13 +9,15 @@ declare x86_stdcallcc void @i(i32)
 ; byref does not imply a stack copy, so this should append 4 bytes,
 ; not 8.
 define void @stdcall(ptr %value) {
-; CHECK-LABEL: _stdcall:
-; CHECK: pushl 4(%esp)
-; CHECK: calll _foo_byref_stdcall_p@4
+; CHECK-LABEL: stdcall:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    calll _foo_byref_stdcall_p@4
+; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    calll _i@4
+; CHECK-NEXT:    retl
   call x86_stdcallcc void @foo_byref_stdcall_p(ptr byref(%Foo) %value)
-; CHECK-NOT: %esp
-; CHECK: pushl
-; CHECK: calll _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/byval2.ll b/llvm/test/CodeGen/X86/byval2.ll
index 77640e4bb5ac6..905879641bc84 100644
--- a/llvm/test/CodeGen/X86/byval2.ll
+++ b/llvm/test/CodeGen/X86/byval2.ll
@@ -39,17 +39,17 @@ define void @g(i64 %a, i64 %b, i64 %c) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $288, %esp # imm = 0x120
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl $34, %ecx
diff --git a/llvm/test/CodeGen/X86/byval3.ll b/llvm/test/CodeGen/X86/byval3.ll
index eef13f1f1aa27..d97968982db87 100644
--- a/llvm/test/CodeGen/X86/byval3.ll
+++ b/llvm/test/CodeGen/X86/byval3.ll
@@ -48,17 +48,17 @@ define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $288, %esp # imm = 0x120
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 20(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl $33, %ecx
diff --git a/llvm/test/CodeGen/X86/byval4.ll b/llvm/test/CodeGen/X86/byval4.ll
index bcb2818ee6e51..05609b990cd7a 100644
--- a/llvm/test/CodeGen/X86/byval4.ll
+++ b/llvm/test/CodeGen/X86/byval4.ll
@@ -53,17 +53,17 @@ define void @g(i16 signext  %a1, i16 signext  %a2, i16 signext  %a3,
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $288, %esp # imm = 0x120
-; X86-NEXT:    movzwl 8(%ebp), %eax
-; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movzwl 12(%ebp), %eax
+; X86-NEXT:    movzwl 28(%ebp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movzwl 16(%ebp), %eax
+; X86-NEXT:    movzwl 24(%ebp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl 20(%ebp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movzwl 24(%ebp), %eax
+; X86-NEXT:    movzwl 16(%ebp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movzwl 28(%ebp), %eax
+; X86-NEXT:    movzwl 12(%ebp), %eax
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzwl 8(%ebp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl $32, %ecx
diff --git a/llvm/test/CodeGen/X86/byval5.ll b/llvm/test/CodeGen/X86/byval5.ll
index 28deafcd982f5..37f4442166c03 100644
--- a/llvm/test/CodeGen/X86/byval5.ll
+++ b/llvm/test/CodeGen/X86/byval5.ll
@@ -62,16 +62,16 @@ define void @g(i8 signext  %a1, i8 signext  %a2, i8 signext  %a3, i8 signext  %a
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $272, %esp # imm = 0x110
 ; X86-NEXT:    movzbl 28(%ebp), %eax
-; X86-NEXT:    movzbl 24(%ebp), %ecx
-; X86-NEXT:    movzbl 20(%ebp), %edx
-; X86-NEXT:    movb 16(%ebp), %ah
-; X86-NEXT:    movb 12(%ebp), %ch
-; X86-NEXT:    movb 8(%ebp), %dh
-; X86-NEXT:    movb %dh, {{[0-9]+}}(%esp)
-; X86-NEXT:    movb %ch, {{[0-9]+}}(%esp)
-; X86-NEXT:    movb %ah, {{[0-9]+}}(%esp)
-; X86-NEXT:    movb %dl, {{[0-9]+}}(%esp)
-; X86-NEXT:    movb %cl, {{[0-9]+}}(%esp)
+; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 24(%ebp), %eax
+; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 20(%ebp), %eax
+; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 16(%ebp), %eax
+; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 12(%ebp), %eax
+; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 8(%ebp), %eax
 ; X86-NEXT:    movb %al, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl $32, %ecx
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
index 67213b38277dc..0a84ca35c73c4 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -54,22 +54,22 @@ define float @canon_fp32_varargsf32(float %a) {
 define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) {
 ; X87-LABEL: canon_fp32_varargsf80:
 ; X87:       # %bb.0:
-; X87-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-NEXT:    fld1
+; X87-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X87-NEXT:    fmulp %st, %st(1)
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: canon_fp32_varargsf80:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fld1
+; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fmulp %st, %st(1)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: canon_fp32_varargsf80:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    fld1
+; X86-AVX-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    fmulp %st, %st(1)
 ; X86-AVX-NEXT:    retl
 ;
@@ -173,25 +173,24 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    fxch %st(1)
-; X87-NEXT:    fucom %st(0)
-; X87-NEXT:    fnstsw %ax
-; X87-NEXT:    fld %st(1)
+; X87-NEXT:    fld %st(0)
 ; X87-NEXT:    ja .LBB3_2
 ; X87-NEXT:  # %bb.1: # %start
 ; X87-NEXT:    fstp %st(0)
-; X87-NEXT:    fldz
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(1)
 ; X87-NEXT:  .LBB3_2: # %start
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fucomp %st(0)
+; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
 ; X87-NEXT:    jp .LBB3_4
 ; X87-NEXT:  # %bb.3: # %start
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fstp %st(0)
 ; X87-NEXT:    fldz
+; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:  .LBB3_4: # %start
-; X87-NEXT:    fstp %st(0)
+; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fmulp %st, %st(1)
 ; X87-NEXT:    retl
@@ -207,12 +206,12 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
 ; X86-SSE-NEXT:    subl $8, %esp
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    movapd %xmm0, %xmm2
-; X86-SSE-NEXT:    cmpunordsd %xmm0, %xmm2
+; X86-SSE-NEXT:    movapd %xmm1, %xmm2
+; X86-SSE-NEXT:    cmpunordsd %xmm1, %xmm2
 ; X86-SSE-NEXT:    movapd %xmm2, %xmm3
-; X86-SSE-NEXT:    andpd %xmm1, %xmm3
-; X86-SSE-NEXT:    maxsd %xmm0, %xmm1
-; X86-SSE-NEXT:    andnpd %xmm1, %xmm2
+; X86-SSE-NEXT:    andpd %xmm0, %xmm3
+; X86-SSE-NEXT:    maxsd %xmm1, %xmm0
+; X86-SSE-NEXT:    andnpd %xmm0, %xmm2
 ; X86-SSE-NEXT:    orpd %xmm3, %xmm2
 ; X86-SSE-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE-NEXT:    movsd %xmm2, (%esp)
@@ -298,25 +297,24 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    fxch %st(1)
-; X87-NEXT:    fucom %st(0)
-; X87-NEXT:    fnstsw %ax
-; X87-NEXT:    fld %st(1)
+; X87-NEXT:    fld %st(0)
 ; X87-NEXT:    ja .LBB4_2
 ; X87-NEXT:  # %bb.1: # %start
 ; X87-NEXT:    fstp %st(0)
-; X87-NEXT:    fldz
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(1)
 ; X87-NEXT:  .LBB4_2: # %start
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fucomp %st(0)
+; X87-NEXT:    fnstsw %ax
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
 ; X87-NEXT:    jp .LBB4_4
 ; X87-NEXT:  # %bb.3: # %start
-; X87-NEXT:    fstp %st(1)
+; X87-NEXT:    fstp %st(0)
 ; X87-NEXT:    fldz
+; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:  .LBB4_4: # %start
-; X87-NEXT:    fstp %st(0)
+; X87-NEXT:    fstp %st(1)
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fmulp %st, %st(1)
 ; X87-NEXT:    retl
@@ -327,12 +325,12 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE-NEXT:    cmpunordss %xmm0, %xmm2
+; X86-SSE-NEXT:    movaps %xmm1, %xmm2
+; X86-SSE-NEXT:    cmpunordss %xmm1, %xmm2
 ; X86-SSE-NEXT:    movaps %xmm2, %xmm3
-; X86-SSE-NEXT:    andps %xmm1, %xmm3
-; X86-SSE-NEXT:    maxss %xmm0, %xmm1
-; X86-SSE-NEXT:    andnps %xmm1, %xmm2
+; X86-SSE-NEXT:    andps %xmm0, %xmm3
+; X86-SSE-NEXT:    maxss %xmm1, %xmm0
+; X86-SSE-NEXT:    andnps %xmm0, %xmm2
 ; X86-SSE-NEXT:    orps %xmm3, %xmm2
 ; X86-SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE-NEXT:    movss %xmm2, (%esp)
@@ -580,21 +578,18 @@ define void @canonicalize_undef(double addrspace(1)* %out) {
 define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) {
 ; X87-LABEL: canon_fp32_varargsv4f32:
 ; X87:       # %bb.0:
-; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fld %st(0)
 ; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
-; X87-NEXT:    fld %st(1)
-; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
-; X87-NEXT:    fld %st(2)
-; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
-; X87-NEXT:    fxch %st(3)
-; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fstps 12(%eax)
-; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps 8(%eax)
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps 4(%eax)
+; X87-NEXT:    fmuls {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%eax)
 ; X87-NEXT:    retl $4
 ;
@@ -636,21 +631,18 @@ define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) {
 define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) {
 ; X87-LABEL: canon_fp64_varargsv4f64:
 ; X87:       # %bb.0:
-; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fld %st(0)
 ; X87-NEXT:    fmull {{[0-9]+}}(%esp)
-; X87-NEXT:    fld %st(1)
-; X87-NEXT:    fmull {{[0-9]+}}(%esp)
-; X87-NEXT:    fld %st(2)
-; X87-NEXT:    fmull {{[0-9]+}}(%esp)
-; X87-NEXT:    fxch %st(3)
-; X87-NEXT:    fmull {{[0-9]+}}(%esp)
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fstpl 24(%eax)
-; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmull {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl 16(%eax)
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmull {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl 8(%eax)
+; X87-NEXT:    fmull {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%eax)
 ; X87-NEXT:    retl $4
 ;
@@ -696,30 +688,30 @@ define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) {
 define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) {
 ; X87-LABEL: canon_fp80_varargsv2fp80:
 ; X87:       # %bb.0:
+; X87-NEXT:    fld1
 ; X87-NEXT:    fldt {{[0-9]+}}(%esp)
+; X87-NEXT:    fmul %st(1), %st
 ; X87-NEXT:    fldt {{[0-9]+}}(%esp)
-; X87-NEXT:    fld1
-; X87-NEXT:    fmul %st, %st(1)
 ; X87-NEXT:    fmulp %st, %st(2)
 ; X87-NEXT:    fxch %st(1)
 ; X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: canon_fp80_varargsv2fp80:
 ; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    fld1
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    fmul %st(1), %st
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fld1
-; X86-SSE-NEXT:    fmul %st, %st(1)
 ; X86-SSE-NEXT:    fmulp %st, %st(2)
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: canon_fp80_varargsv2fp80:
 ; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    fld1
 ; X86-AVX-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    fmul %st(1), %st
 ; X86-AVX-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    fld1
-; X86-AVX-NEXT:    fmul %st, %st(1)
 ; X86-AVX-NEXT:    fmulp %st, %st(2)
 ; X86-AVX-NEXT:    fxch %st(1)
 ; X86-AVX-NEXT:    retl
@@ -753,18 +745,15 @@ define void @vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fld %st(0)
-; X87-NEXT:    fmuls (%eax)
-; X87-NEXT:    fld %st(1)
-; X87-NEXT:    fmuls 4(%eax)
-; X87-NEXT:    fld %st(2)
-; X87-NEXT:    fmuls 8(%eax)
-; X87-NEXT:    fxch %st(3)
 ; X87-NEXT:    fmuls 12(%eax)
 ; X87-NEXT:    fstps 12(%eax)
-; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmuls 8(%eax)
 ; X87-NEXT:    fstps 8(%eax)
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmuls 4(%eax)
 ; X87-NEXT:    fstps 4(%eax)
+; X87-NEXT:    fmuls (%eax)
 ; X87-NEXT:    fstps (%eax)
 ; X87-NEXT:    retl
 ;
@@ -823,18 +812,15 @@ define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 {
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fld %st(0)
-; X87-NEXT:    fmull (%eax)
-; X87-NEXT:    fld %st(1)
-; X87-NEXT:    fmull 8(%eax)
-; X87-NEXT:    fld %st(2)
-; X87-NEXT:    fmull 16(%eax)
-; X87-NEXT:    fxch %st(3)
 ; X87-NEXT:    fmull 24(%eax)
 ; X87-NEXT:    fstpl 24(%eax)
-; X87-NEXT:    fxch %st(2)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmull 16(%eax)
 ; X87-NEXT:    fstpl 16(%eax)
-; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fld %st(0)
+; X87-NEXT:    fmull 8(%eax)
 ; X87-NEXT:    fstpl 8(%eax)
+; X87-NEXT:    fmull (%eax)
 ; X87-NEXT:    fstpl (%eax)
 ; X87-NEXT:    retl
 ;
@@ -842,11 +828,11 @@ define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0]
-; X86-SSE-NEXT:    movapd 16(%eax), %xmm1
+; X86-SSE-NEXT:    movapd (%eax), %xmm1
 ; X86-SSE-NEXT:    mulpd %xmm0, %xmm1
-; X86-SSE-NEXT:    mulpd (%eax), %xmm0
-; X86-SSE-NEXT:    movapd %xmm0, (%eax)
-; X86-SSE-NEXT:    movapd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movapd %xmm1, (%eax)
+; X86-SSE-NEXT:    mulpd 16(%eax), %xmm0
+; X86-SSE-NEXT:    movapd %xmm0, 16(%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: vec_canonicalize_var_v4f64:
@@ -902,19 +888,18 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 {
 ; X87:       # %bb.0:
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fldt 30(%eax)
-; X87-NEXT:    fldt 20(%eax)
-; X87-NEXT:    fldt 10(%eax)
-; X87-NEXT:    fldt (%eax)
 ; X87-NEXT:    fld1
 ; X87-NEXT:    fmul %st, %st(1)
-; X87-NEXT:    fmul %st, %st(2)
-; X87-NEXT:    fmul %st, %st(3)
-; X87-NEXT:    fmulp %st, %st(4)
-; X87-NEXT:    fxch %st(3)
-; X87-NEXT:    fstpt 30(%eax)
 ; X87-NEXT:    fxch %st(1)
+; X87-NEXT:    fstpt 30(%eax)
+; X87-NEXT:    fldt 20(%eax)
+; X87-NEXT:    fmul %st(1), %st
 ; X87-NEXT:    fstpt 20(%eax)
+; X87-NEXT:    fldt 10(%eax)
+; X87-NEXT:    fmul %st(1), %st
 ; X87-NEXT:    fstpt 10(%eax)
+; X87-NEXT:    fldt (%eax)
+; X87-NEXT:    fmulp %st, %st(1)
 ; X87-NEXT:    fstpt (%eax)
 ; X87-NEXT:    retl
 ;
@@ -922,19 +907,18 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    fldt 30(%eax)
-; X86-SSE-NEXT:    fldt 20(%eax)
-; X86-SSE-NEXT:    fldt 10(%eax)
-; X86-SSE-NEXT:    fldt (%eax)
 ; X86-SSE-NEXT:    fld1
 ; X86-SSE-NEXT:    fmul %st, %st(1)
-; X86-SSE-NEXT:    fmul %st, %st(2)
-; X86-SSE-NEXT:    fmul %st, %st(3)
-; X86-SSE-NEXT:    fmulp %st, %st(4)
-; X86-SSE-NEXT:    fxch %st(3)
-; X86-SSE-NEXT:    fstpt 30(%eax)
 ; X86-SSE-NEXT:    fxch %st(1)
+; X86-SSE-NEXT:    fstpt 30(%eax)
+; X86-SSE-NEXT:    fldt 20(%eax)
+; X86-SSE-NEXT:    fmul %st(1), %st
 ; X86-SSE-NEXT:    fstpt 20(%eax)
+; X86-SSE-NEXT:    fldt 10(%eax)
+; X86-SSE-NEXT:    fmul %st(1), %st
 ; X86-SSE-NEXT:    fstpt 10(%eax)
+; X86-SSE-NEXT:    fldt (%eax)
+; X86-SSE-NEXT:    fmulp %st, %st(1)
 ; X86-SSE-NEXT:    fstpt (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -942,19 +926,18 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 {
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    fldt 30(%eax)
-; X86-AVX-NEXT:    fldt 20(%eax)
-; X86-AVX-NEXT:    fldt 10(%eax)
-; X86-AVX-NEXT:    fldt (%eax)
 ; X86-AVX-NEXT:    fld1
 ; X86-AVX-NEXT:    fmul %st, %st(1)
-; X86-AVX-NEXT:    fmul %st, %st(2)
-; X86-AVX-NEXT:    fmul %st, %st(3)
-; X86-AVX-NEXT:    fmulp %st, %st(4)
-; X86-AVX-NEXT:    fxch %st(3)
-; X86-AVX-NEXT:    fstpt 30(%eax)
 ; X86-AVX-NEXT:    fxch %st(1)
+; X86-AVX-NEXT:    fstpt 30(%eax)
+; X86-AVX-NEXT:    fldt 20(%eax)
+; X86-AVX-NEXT:    fmul %st(1), %st
 ; X86-AVX-NEXT:    fstpt 20(%eax)
+; X86-AVX-NEXT:    fldt 10(%eax)
+; X86-AVX-NEXT:    fmul %st(1), %st
 ; X86-AVX-NEXT:    fstpt 10(%eax)
+; X86-AVX-NEXT:    fldt (%eax)
+; X86-AVX-NEXT:    fmulp %st, %st(1)
 ; X86-AVX-NEXT:    fstpt (%eax)
 ; X86-AVX-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/clear-highbits.ll b/llvm/test/CodeGen/X86/clear-highbits.ll
index 755b1094234fd..8775501c855f3 100644
--- a/llvm/test/CodeGen/X86/clear-highbits.ll
+++ b/llvm/test/CodeGen/X86/clear-highbits.ll
@@ -47,9 +47,9 @@ define i8 @clear_highbits8_c0(i8 %val, i8 %numhighbits) nounwind {
 define i8 @clear_highbits8_c2_load(ptr %w, i8 %numhighbits) nounwind {
 ; X86-LABEL: clear_highbits8_c2_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl (%eax), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shlb %cl, %al
 ; X86-NEXT:    shrb %cl, %al
 ; X86-NEXT:    retl
@@ -183,9 +183,9 @@ define i16 @clear_highbits16_c1_indexzext(i16 %val, i8 %numhighbits) nounwind {
 define i16 @clear_highbits16_c2_load(ptr %w, i16 %numhighbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits16_c2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    movzwl %ax, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
@@ -231,9 +231,9 @@ define i16 @clear_highbits16_c2_load(ptr %w, i16 %numhighbits) nounwind {
 define i16 @clear_highbits16_c3_load_indexzext(ptr %w, i8 %numhighbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits16_c3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    movzwl %ax, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
@@ -401,20 +401,20 @@ define i32 @clear_highbits32_c1_indexzext(i32 %val, i8 %numhighbits) nounwind {
 define i32 @clear_highbits32_c2_load(ptr %w, i32 %numhighbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_c2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits32_c2_load:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl $32, %ecx
+; X86-BMI2-NEXT:    subl %eax, %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl $32, %edx
-; X86-BMI2-NEXT:    subl %ecx, %edx
-; X86-BMI2-NEXT:    bzhil %edx, (%eax), %eax
+; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits32_c2_load:
@@ -441,20 +441,20 @@ define i32 @clear_highbits32_c2_load(ptr %w, i32 %numhighbits) nounwind {
 define i32 @clear_highbits32_c3_load_indexzext(ptr %w, i8 %numhighbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_c3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits32_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl $32, %ecx
+; X86-BMI2-NEXT:    subl %eax, %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl $32, %edx
-; X86-BMI2-NEXT:    subl %ecx, %edx
-; X86-BMI2-NEXT:    bzhil %edx, (%eax), %eax
+; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits32_c3_load_indexzext:
@@ -670,63 +670,57 @@ define i64 @clear_highbits64_c1_indexzext(i64 %val, i8 %numhighbits) nounwind {
 define i64 @clear_highbits64_c2_load(ptr %w, i64 %numhighbits) nounwind {
 ; X86-BASELINE-LABEL: clear_highbits64_c2_load:
 ; X86-BASELINE:       # %bb.0:
-; X86-BASELINE-NEXT:    pushl %edi
 ; X86-BASELINE-NEXT:    pushl %esi
-; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BASELINE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BASELINE-NEXT:    movl $-1, %eax
-; X86-BASELINE-NEXT:    movl $-1, %edi
-; X86-BASELINE-NEXT:    shrl %cl, %edi
+; X86-BASELINE-NEXT:    movl $-1, %esi
+; X86-BASELINE-NEXT:    shrl %cl, %esi
 ; X86-BASELINE-NEXT:    xorl %edx, %edx
 ; X86-BASELINE-NEXT:    testb $32, %cl
 ; X86-BASELINE-NEXT:    jne .LBB15_1
 ; X86-BASELINE-NEXT:  # %bb.2:
-; X86-BASELINE-NEXT:    movl %edi, %edx
+; X86-BASELINE-NEXT:    movl %esi, %edx
 ; X86-BASELINE-NEXT:    jmp .LBB15_3
 ; X86-BASELINE-NEXT:  .LBB15_1:
-; X86-BASELINE-NEXT:    movl %edi, %eax
+; X86-BASELINE-NEXT:    movl %esi, %eax
 ; X86-BASELINE-NEXT:  .LBB15_3:
-; X86-BASELINE-NEXT:    andl (%esi), %eax
-; X86-BASELINE-NEXT:    andl 4(%esi), %edx
+; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BASELINE-NEXT:    andl (%ecx), %eax
+; X86-BASELINE-NEXT:    andl 4(%ecx), %edx
 ; X86-BASELINE-NEXT:    popl %esi
-; X86-BASELINE-NEXT:    popl %edi
 ; X86-BASELINE-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: clear_highbits64_c2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl $-1, %edi
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    cmovel %eax, %edx
-; X86-BMI1-NEXT:    cmovel %edi, %eax
-; X86-BMI1-NEXT:    andl (%esi), %eax
-; X86-BMI1-NEXT:    andl 4(%esi), %edx
+; X86-BMI1-NEXT:    cmovel %esi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andl (%ecx), %eax
+; X86-BMI1-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits64_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
 ; X86-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits64_c2_load:
@@ -753,63 +747,57 @@ define i64 @clear_highbits64_c2_load(ptr %w, i64 %numhighbits) nounwind {
 define i64 @clear_highbits64_c3_load_indexzext(ptr %w, i8 %numhighbits) nounwind {
 ; X86-BASELINE-LABEL: clear_highbits64_c3_load_indexzext:
 ; X86-BASELINE:       # %bb.0:
-; X86-BASELINE-NEXT:    pushl %edi
 ; X86-BASELINE-NEXT:    pushl %esi
-; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BASELINE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BASELINE-NEXT:    movl $-1, %eax
-; X86-BASELINE-NEXT:    movl $-1, %edi
-; X86-BASELINE-NEXT:    shrl %cl, %edi
+; X86-BASELINE-NEXT:    movl $-1, %esi
+; X86-BASELINE-NEXT:    shrl %cl, %esi
 ; X86-BASELINE-NEXT:    xorl %edx, %edx
 ; X86-BASELINE-NEXT:    testb $32, %cl
 ; X86-BASELINE-NEXT:    jne .LBB16_1
 ; X86-BASELINE-NEXT:  # %bb.2:
-; X86-BASELINE-NEXT:    movl %edi, %edx
+; X86-BASELINE-NEXT:    movl %esi, %edx
 ; X86-BASELINE-NEXT:    jmp .LBB16_3
 ; X86-BASELINE-NEXT:  .LBB16_1:
-; X86-BASELINE-NEXT:    movl %edi, %eax
+; X86-BASELINE-NEXT:    movl %esi, %eax
 ; X86-BASELINE-NEXT:  .LBB16_3:
-; X86-BASELINE-NEXT:    andl (%esi), %eax
-; X86-BASELINE-NEXT:    andl 4(%esi), %edx
+; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BASELINE-NEXT:    andl (%ecx), %eax
+; X86-BASELINE-NEXT:    andl 4(%ecx), %edx
 ; X86-BASELINE-NEXT:    popl %esi
-; X86-BASELINE-NEXT:    popl %edi
 ; X86-BASELINE-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: clear_highbits64_c3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl $-1, %edi
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    cmovel %eax, %edx
-; X86-BMI1-NEXT:    cmovel %edi, %eax
-; X86-BMI1-NEXT:    andl (%esi), %eax
-; X86-BMI1-NEXT:    andl 4(%esi), %edx
+; X86-BMI1-NEXT:    cmovel %esi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andl (%ecx), %eax
+; X86-BMI1-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
 ; X86-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_highbits64_c3_load_indexzext:
@@ -914,20 +902,20 @@ define i64 @clear_highbits64_c4_commutative(i64 %val, i64 %numhighbits) nounwind
 define i32 @oneuse32_c(i32 %val, i32 %numhighbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: oneuse32_c:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    movl %eax, (%edx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneuse32_c:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shrxl %eax, %edx, %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
@@ -958,64 +946,61 @@ define i32 @oneuse32_c(i32 %val, i32 %numhighbits, ptr %escape) nounwind {
 define i64 @oneuse64_c(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 ; X86-BASELINE-LABEL: oneuse64_c:
 ; X86-BASELINE:       # %bb.0:
-; X86-BASELINE-NEXT:    pushl %esi
-; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BASELINE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BASELINE-NEXT:    movl $-1, %eax
-; X86-BASELINE-NEXT:    movl $-1, %edx
-; X86-BASELINE-NEXT:    shrl %cl, %edx
+; X86-BASELINE-NEXT:    shrl %cl, %eax
+; X86-BASELINE-NEXT:    xorl %edx, %edx
 ; X86-BASELINE-NEXT:    testb $32, %cl
-; X86-BASELINE-NEXT:    je .LBB19_2
+; X86-BASELINE-NEXT:    jne .LBB19_2
 ; X86-BASELINE-NEXT:  # %bb.1:
-; X86-BASELINE-NEXT:    movl %edx, %eax
-; X86-BASELINE-NEXT:    xorl %edx, %edx
+; X86-BASELINE-NEXT:    movl %eax, %edx
 ; X86-BASELINE-NEXT:  .LBB19_2:
-; X86-BASELINE-NEXT:    movl %edx, 4(%esi)
-; X86-BASELINE-NEXT:    movl %eax, (%esi)
+; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BASELINE-NEXT:    movl %edx, 4(%ecx)
+; X86-BASELINE-NEXT:    jne .LBB19_4
+; X86-BASELINE-NEXT:  # %bb.3:
+; X86-BASELINE-NEXT:    movl $-1, %eax
+; X86-BASELINE-NEXT:  .LBB19_4:
+; X86-BASELINE-NEXT:    movl %eax, (%ecx)
 ; X86-BASELINE-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BASELINE-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BASELINE-NEXT:    popl %esi
 ; X86-BASELINE-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: oneuse64_c:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %eax
-; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    cmovnel %edi, %eax
-; X86-BMI1-NEXT:    cmovel %edi, %edx
-; X86-BMI1-NEXT:    movl %edx, 4(%esi)
-; X86-BMI1-NEXT:    movl %eax, (%esi)
+; X86-BMI1-NEXT:    cmovel %eax, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI1-NEXT:    cmovel %esi, %eax
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneuse64_c:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
-; X86-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    cmovel %esi, %edx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI2-NEXT:    cmovnel %esi, %eax
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: oneuse64_c:
@@ -1044,21 +1029,21 @@ define i64 @oneuse64_c(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 define i32 @oneuse32_d(i32 %val, i32 %numhighbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: oneuse32_d:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %eax, (%edx)
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneuse32_d:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shlxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl %edx, (%eax)
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shlxl %eax, {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: oneuse32_d:
@@ -1090,36 +1075,34 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 ; X86-BASELINE-NEXT:    pushl %edi
 ; X86-BASELINE-NEXT:    pushl %esi
 ; X86-BASELINE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BASELINE-NEXT:    movl %edx, %edi
+; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BASELINE-NEXT:    movl %eax, %edi
 ; X86-BASELINE-NEXT:    shll %cl, %edi
-; X86-BASELINE-NEXT:    shldl %cl, %edx, %eax
+; X86-BASELINE-NEXT:    shldl %cl, %eax, %esi
+; X86-BASELINE-NEXT:    xorl %edx, %edx
 ; X86-BASELINE-NEXT:    testb $32, %cl
-; X86-BASELINE-NEXT:    movl %edi, %esi
-; X86-BASELINE-NEXT:    jne .LBB21_2
+; X86-BASELINE-NEXT:    je .LBB21_2
 ; X86-BASELINE-NEXT:  # %bb.1:
-; X86-BASELINE-NEXT:    movl %eax, %esi
+; X86-BASELINE-NEXT:    movl %edi, %esi
+; X86-BASELINE-NEXT:    xorl %edi, %edi
 ; X86-BASELINE-NEXT:  .LBB21_2:
 ; X86-BASELINE-NEXT:    movl %esi, %eax
 ; X86-BASELINE-NEXT:    shrl %cl, %eax
-; X86-BASELINE-NEXT:    xorl %ebx, %ebx
 ; X86-BASELINE-NEXT:    testb $32, %cl
-; X86-BASELINE-NEXT:    movl $0, %edx
 ; X86-BASELINE-NEXT:    jne .LBB21_4
 ; X86-BASELINE-NEXT:  # %bb.3:
-; X86-BASELINE-NEXT:    movl %edi, %ebx
 ; X86-BASELINE-NEXT:    movl %eax, %edx
 ; X86-BASELINE-NEXT:  .LBB21_4:
-; X86-BASELINE-NEXT:    movl %ebx, %edi
-; X86-BASELINE-NEXT:    shrdl %cl, %esi, %edi
+; X86-BASELINE-NEXT:    movl %edi, %ebx
+; X86-BASELINE-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BASELINE-NEXT:    testb $32, %cl
 ; X86-BASELINE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BASELINE-NEXT:    movl %ebx, (%ecx)
+; X86-BASELINE-NEXT:    movl %edi, (%ecx)
 ; X86-BASELINE-NEXT:    movl %esi, 4(%ecx)
 ; X86-BASELINE-NEXT:    jne .LBB21_6
 ; X86-BASELINE-NEXT:  # %bb.5:
-; X86-BASELINE-NEXT:    movl %edi, %eax
+; X86-BASELINE-NEXT:    movl %ebx, %eax
 ; X86-BASELINE-NEXT:  .LBB21_6:
 ; X86-BASELINE-NEXT:    popl %esi
 ; X86-BASELINE-NEXT:    popl %edi
@@ -1137,13 +1120,13 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 ; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    shldl %cl, %edx, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    cmovnel %eax, %esi
+; X86-BMI1-NEXT:    cmovnel %edx, %eax
 ; X86-BMI1-NEXT:    movl %esi, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    cmovnel %edx, %eax
 ; X86-BMI1-NEXT:    cmovel %edi, %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI1-NEXT:    movl %eax, (%ebx)
@@ -1166,12 +1149,13 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    cmovnel %eax, %esi
-; X86-BMI2-NEXT:    cmovnel %edx, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI2-NEXT:    cmovel %edi, %edx
+; X86-BMI2-NEXT:    movl %edi, %edx
+; X86-BMI2-NEXT:    cmovnel %ebx, %edx
+; X86-BMI2-NEXT:    cmovnel %ebx, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movl %eax, (%ebx)
 ; X86-BMI2-NEXT:    shrdl %cl, %esi, %eax
@@ -1216,9 +1200,9 @@ define i64 @oneusei64_d(i64 %val, i64 %numhighbits, ptr %escape) nounwind {
 define i32 @clear_highbits32_16(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_16:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
@@ -1257,9 +1241,9 @@ define i32 @clear_highbits32_16(i32 %val, i32 %numlowbits) nounwind {
 define i32 @clear_highbits32_48(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_48:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movb $48, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
@@ -1299,22 +1283,22 @@ define i32 @clear_highbits32_48(i32 %val, i32 %numlowbits) nounwind {
 define i32 @clear_highbits32_16_extrause(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_16_extrause:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    movl %eax, (%edx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits32_16_extrause:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movb $16, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shrxl %eax, %edx, %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
@@ -1347,22 +1331,22 @@ define i32 @clear_highbits32_16_extrause(i32 %val, i32 %numlowbits, ptr %escape)
 define i32 @clear_highbits32_48_extrause(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: clear_highbits32_48_extrause:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $48, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
-; X86-NOBMI2-NEXT:    movl %eax, (%edx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_highbits32_48_extrause:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movb $48, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shrxl %eax, %edx, %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/clear-lowbits.ll b/llvm/test/CodeGen/X86/clear-lowbits.ll
index 49ea2d0f1ed7a..f5c2e1a0f4abf 100644
--- a/llvm/test/CodeGen/X86/clear-lowbits.ll
+++ b/llvm/test/CodeGen/X86/clear-lowbits.ll
@@ -49,9 +49,9 @@ define i8 @clear_lowbits8_c0(i8 %val, i8 %numlowbits) nounwind {
 define i8 @clear_lowbits8_c2_load(ptr %w, i8 %numlowbits) nounwind {
 ; X86-LABEL: clear_lowbits8_c2_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl (%eax), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrb %cl, %al
 ; X86-NEXT:    shlb %cl, %al
 ; X86-NEXT:    retl
@@ -107,10 +107,10 @@ define i16 @clear_lowbits16_c0(i16 %val, i16 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_c0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -148,10 +148,10 @@ define i16 @clear_lowbits16_c1_indexzext(i16 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_c1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -181,9 +181,9 @@ define i16 @clear_lowbits16_c1_indexzext(i16 %val, i8 %numlowbits) nounwind {
 define i16 @clear_lowbits16_c2_load(ptr %w, i16 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_c2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -225,9 +225,9 @@ define i16 @clear_lowbits16_c2_load(ptr %w, i16 %numlowbits) nounwind {
 define i16 @clear_lowbits16_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_c3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -279,10 +279,10 @@ define i16 @clear_lowbits16_c4_commutative(i16 %val, i16 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_c4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -384,19 +384,19 @@ define i32 @clear_lowbits32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @clear_lowbits32_c2_load(ptr %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits32_c2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits32_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits32_c2_load:
@@ -422,19 +422,19 @@ define i32 @clear_lowbits32_c2_load(ptr %w, i32 %numlowbits) nounwind {
 define i32 @clear_lowbits32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits32_c3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movl (%eax), %eax
+; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits32_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits32_c3_load_indexzext:
@@ -602,8 +602,6 @@ define i64 @clear_lowbits64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @clear_lowbits64_c2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_c2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
@@ -614,27 +612,25 @@ define i64 @clear_lowbits64_c2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB15_2:
-; X86-NOBMI2-NEXT:    andl (%esi), %eax
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI2-NEXT:    popl %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    andl (%ecx), %eax
+; X86-NOBMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB15_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB15_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c2_load:
@@ -660,8 +656,6 @@ define i64 @clear_lowbits64_c2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @clear_lowbits64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_c3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
@@ -672,27 +666,25 @@ define i64 @clear_lowbits64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB16_2:
-; X86-NOBMI2-NEXT:    andl (%esi), %eax
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI2-NEXT:    popl %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    andl (%ecx), %eax
+; X86-NOBMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB16_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB16_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_c3_load_indexzext:
@@ -777,9 +769,9 @@ define i64 @clear_lowbits64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 define i8 @clear_lowbits8_ic0(i8 %val, i8 %numlowbits) nounwind {
 ; X86-LABEL: clear_lowbits8_ic0:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $8, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrb %cl, %al
 ; X86-NEXT:    shlb %cl, %al
 ; X86-NEXT:    retl
@@ -802,10 +794,10 @@ define i8 @clear_lowbits8_ic0(i8 %val, i8 %numlowbits) nounwind {
 define i8 @clear_lowbits8_ic2_load(ptr %w, i8 %numlowbits) nounwind {
 ; X86-LABEL: clear_lowbits8_ic2_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movb $8, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    shrb %cl, %al
 ; X86-NEXT:    shlb %cl, %al
 ; X86-NEXT:    retl
@@ -828,9 +820,9 @@ define i8 @clear_lowbits8_ic2_load(ptr %w, i8 %numlowbits) nounwind {
 define i8 @clear_lowbits8_ic4_commutative(i8 %val, i8 %numlowbits) nounwind {
 ; X86-LABEL: clear_lowbits8_ic4_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $8, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrb %cl, %al
 ; X86-NEXT:    shlb %cl, %al
 ; X86-NEXT:    retl
@@ -855,9 +847,9 @@ define i8 @clear_lowbits8_ic4_commutative(i8 %val, i8 %numlowbits) nounwind {
 define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic0:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -865,11 +857,11 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_ic0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movb $16, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movb $16, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -901,9 +893,9 @@ define i16 @clear_lowbits16_ic0(i16 %val, i16 %numlowbits) nounwind {
 define i16 @clear_lowbits16_ic1_indexzext(i16 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic1_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -911,11 +903,11 @@ define i16 @clear_lowbits16_ic1_indexzext(i16 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_ic1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movb $16, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movb $16, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -948,10 +940,10 @@ define i16 @clear_lowbits16_ic1_indexzext(i16 %val, i8 %numlowbits) nounwind {
 define i16 @clear_lowbits16_ic2_load(ptr %w, i16 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -959,12 +951,12 @@ define i16 @clear_lowbits16_ic2_load(ptr %w, i16 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_ic2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzwl (%eax), %eax
-; X86-BMI2-NEXT:    movb $16, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movb $16, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzwl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -997,10 +989,10 @@ define i16 @clear_lowbits16_ic2_load(ptr %w, i16 %numlowbits) nounwind {
 define i16 @clear_lowbits16_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1008,12 +1000,12 @@ define i16 @clear_lowbits16_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_ic3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzwl (%eax), %eax
-; X86-BMI2-NEXT:    movb $16, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movb $16, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzwl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -1047,9 +1039,9 @@ define i16 @clear_lowbits16_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind
 define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movb $16, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1057,11 +1049,11 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 ;
 ; X86-BMI2-LABEL: clear_lowbits16_ic4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movb $16, %cl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movb $16, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -1095,9 +1087,9 @@ define i16 @clear_lowbits16_ic4_commutative(i16 %val, i16 %numlowbits) nounwind
 define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits32_ic0:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1136,9 +1128,9 @@ define i32 @clear_lowbits32_ic0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @clear_lowbits32_ic1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits32_ic1_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1189,11 +1181,11 @@ define i32 @clear_lowbits32_ic2_load(ptr %w, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: clear_lowbits32_ic2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    xorl %eax, %eax
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits32_ic2_load:
@@ -1233,11 +1225,11 @@ define i32 @clear_lowbits32_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind
 ;
 ; X86-BMI2-LABEL: clear_lowbits32_ic3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %eax
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    xorl %eax, %eax
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits32_ic3_load_indexzext:
@@ -1267,9 +1259,9 @@ define i32 @clear_lowbits32_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind
 define i32 @clear_lowbits32_ic4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits32_ic4_commutative:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
@@ -1424,8 +1416,6 @@ define i64 @clear_lowbits64_ic1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @clear_lowbits64_ic2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_ic2_load:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movb $64, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
@@ -1437,28 +1427,26 @@ define i64 @clear_lowbits64_ic2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB33_2:
-; X86-NOBMI2-NEXT:    andl (%esi), %eax
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI2-NEXT:    popl %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    andl (%ecx), %eax
+; X86-NOBMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movb $64, %bl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB33_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB33_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic2_load:
@@ -1487,8 +1475,6 @@ define i64 @clear_lowbits64_ic2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @clear_lowbits64_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI2-LABEL: clear_lowbits64_ic3_load_indexzext:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movb $64, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI2-NEXT:    movl $-1, %edx
@@ -1500,28 +1486,26 @@ define i64 @clear_lowbits64_ic3_load_indexzext(ptr %w, i8 %numlowbits) nounwind
 ; X86-NOBMI2-NEXT:    movl %eax, %edx
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:  .LBB34_2:
-; X86-NOBMI2-NEXT:    andl (%esi), %eax
-; X86-NOBMI2-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI2-NEXT:    popl %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    andl (%ecx), %eax
+; X86-NOBMI2-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: clear_lowbits64_ic3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movb $64, %bl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB34_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB34_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    andl 4(%ecx), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic3_load_indexzext:
@@ -1612,20 +1596,20 @@ define i64 @clear_lowbits64_ic4_commutative(i64 %val, i64 %numlowbits) nounwind
 define i32 @oneuse32_c(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: oneuse32_c:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    movl $-1, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
-; X86-NOBMI2-NEXT:    movl %eax, (%edx)
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneuse32_c:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %eax, %edx, %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shlxl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
@@ -1656,53 +1640,52 @@ define i32 @oneuse32_c(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 define i64 @oneuse64(i64 %val, i64 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI2-LABEL: oneuse64:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    pushl %edi
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI2-NEXT:    movl $-1, %edx
-; X86-NOBMI2-NEXT:    movl $-1, %edi
-; X86-NOBMI2-NEXT:    shll %cl, %edi
+; X86-NOBMI2-NEXT:    movl $-1, %esi
+; X86-NOBMI2-NEXT:    shll %cl, %esi
 ; X86-NOBMI2-NEXT:    xorl %eax, %eax
 ; X86-NOBMI2-NEXT:    testb $32, %cl
-; X86-NOBMI2-NEXT:    jne .LBB37_1
-; X86-NOBMI2-NEXT:  # %bb.2:
-; X86-NOBMI2-NEXT:    movl %edi, %eax
-; X86-NOBMI2-NEXT:    jmp .LBB37_3
-; X86-NOBMI2-NEXT:  .LBB37_1:
-; X86-NOBMI2-NEXT:    movl %edi, %edx
-; X86-NOBMI2-NEXT:  .LBB37_3:
-; X86-NOBMI2-NEXT:    movl %edx, 4(%esi)
-; X86-NOBMI2-NEXT:    movl %eax, (%esi)
+; X86-NOBMI2-NEXT:    movl %esi, %edx
+; X86-NOBMI2-NEXT:    jne .LBB37_2
+; X86-NOBMI2-NEXT:  # %bb.1:
+; X86-NOBMI2-NEXT:    movl $-1, %edx
+; X86-NOBMI2-NEXT:  .LBB37_2:
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-NOBMI2-NEXT:    jne .LBB37_4
+; X86-NOBMI2-NEXT:  # %bb.3:
+; X86-NOBMI2-NEXT:    movl %esi, %eax
+; X86-NOBMI2-NEXT:  .LBB37_4:
+; X86-NOBMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    popl %esi
-; X86-NOBMI2-NEXT:    popl %edi
 ; X86-NOBMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: oneuse64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:    shlxl %edx, %eax, %ecx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
-; X86-BMI2-NEXT:    jne .LBB37_1
-; X86-BMI2-NEXT:  # %bb.2:
-; X86-BMI2-NEXT:    movl %esi, %eax
-; X86-BMI2-NEXT:    jmp .LBB37_3
-; X86-BMI2-NEXT:  .LBB37_1:
-; X86-BMI2-NEXT:    movl %esi, %edx
-; X86-BMI2-NEXT:  .LBB37_3:
-; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
-; X86-BMI2-NEXT:    movl %eax, (%ecx)
+; X86-BMI2-NEXT:    testb $32, %dl
+; X86-BMI2-NEXT:    movl %ecx, %edx
+; X86-BMI2-NEXT:    jne .LBB37_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:  .LBB37_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl %edx, 4(%esi)
+; X86-BMI2-NEXT:    jne .LBB37_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl %ecx, %eax
+; X86-BMI2-NEXT:  .LBB37_4:
+; X86-BMI2-NEXT:    movl %eax, (%esi)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI2-LABEL: oneuse64:
diff --git a/llvm/test/CodeGen/X86/clmul-x86.ll b/llvm/test/CodeGen/X86/clmul-x86.ll
index cac2625f66fa1..d91aa843ca37f 100644
--- a/llvm/test/CodeGen/X86/clmul-x86.ll
+++ b/llvm/test/CodeGen/X86/clmul-x86.ll
@@ -32,49 +32,47 @@ define i64 @clmul_i64(i64 %a, i64 %b) nounwind {
 define i64 @clmulr_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: clmulr_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    bswapl %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NEXT:    shll $4, %edi
-; CHECK-NEXT:    shrl $4, %esi
-; CHECK-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-NEXT:    orl %edi, %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; CHECK-NEXT:    shrl $2, %esi
-; CHECK-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; CHECK-NEXT:    leal (%esi,%edi,4), %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NEXT:    shrl %esi
-; CHECK-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-NEXT:    leal (%esi,%edi,2), %esi
-; CHECK-NEXT:    movd %esi, %xmm1
-; CHECK-NEXT:    bswapl %edx
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-NEXT:    shll $4, %esi
-; CHECK-NEXT:    shrl $4, %edx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; CHECK-NEXT:    shrl $2, %edx
+; CHECK-NEXT:    shll $4, %edx
+; CHECK-NEXT:    shrl $4, %ecx
+; CHECK-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; CHECK-NEXT:    leal (%edx,%esi,4), %edx
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-NEXT:    shrl %edx
+; CHECK-NEXT:    shrl $2, %ecx
+; CHECK-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; CHECK-NEXT:    leal (%ecx,%edx,4), %ecx
+; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; CHECK-NEXT:    leal (%edx,%esi,2), %edx
-; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    shrl %ecx
+; CHECK-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; CHECK-NEXT:    leal (%ecx,%edx,2), %ecx
+; CHECK-NEXT:    movd %ecx, %xmm1
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; CHECK-NEXT:    shll $4, %ecx
+; CHECK-NEXT:    shrl $4, %eax
+; CHECK-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; CHECK-NEXT:    leal (%eax,%ecx,4), %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NEXT:    leal (%eax,%ecx,2), %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    bswapl %ecx
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
@@ -150,8 +148,6 @@ define i64 @clmulr_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT:    shrl %eax
 ; CHECK-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; CHECK-NEXT:    leal (%eax,%ecx,2), %eax
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
   %a.ext = zext i64 %a to i128
   %b.ext = zext i64 %b to i128
diff --git a/llvm/test/CodeGen/X86/cmov-fp.ll b/llvm/test/CodeGen/X86/cmov-fp.ll
index 77665d083b7e3..e9a63965a4a6f 100644
--- a/llvm/test/CodeGen/X86/cmov-fp.ll
+++ b/llvm/test/CodeGen/X86/cmov-fp.ll
@@ -29,28 +29,26 @@ define double @test1(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test1:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovnbe %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test1:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    ja .LBB0_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB0_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ugt i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -81,28 +79,26 @@ define double @test2(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test2:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovnb %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test2:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jae .LBB1_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB1_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp uge i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -133,28 +129,26 @@ define double @test3(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test3:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovb %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test3:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jb .LBB2_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB2_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ult i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -185,28 +179,26 @@ define double @test4(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test4:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovbe %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test4:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jbe .LBB3_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB3_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ule i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -237,30 +229,28 @@ define double @test5(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test5:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setg %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test5:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jg .LBB4_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB4_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sgt i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -291,30 +281,28 @@ define double @test6(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test6:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setge %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test6:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jge .LBB5_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB5_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sge i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -345,30 +333,28 @@ define double @test7(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test7:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setl %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test7:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jl .LBB6_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB6_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp slt i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -399,30 +385,28 @@ define double @test8(i32 %a, i32 %b, double %x) nounwind {
 ;
 ; NOSSE-LABEL: test8:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setle %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test8:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jle .LBB7_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB7_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sle i32 %a, %b
   %sel = select i1 %cmp, double 99.0, double %x
@@ -466,28 +450,26 @@ define float @test9(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test9:
 ; NOSSE1:       # %bb.0:
+; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
 ; NOSSE1-NEXT:    fcmovnbe %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test9:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    ja .LBB8_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB8_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ugt i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -531,28 +513,26 @@ define float @test10(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test10:
 ; NOSSE1:       # %bb.0:
+; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
 ; NOSSE1-NEXT:    fcmovnb %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test10:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jae .LBB9_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB9_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp uge i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -596,28 +576,26 @@ define float @test11(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test11:
 ; NOSSE1:       # %bb.0:
+; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
 ; NOSSE1-NEXT:    fcmovb %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test11:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jb .LBB10_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB10_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ult i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -661,28 +639,26 @@ define float @test12(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test12:
 ; NOSSE1:       # %bb.0:
+; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
 ; NOSSE1-NEXT:    fcmovbe %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test12:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jbe .LBB11_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB11_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ule i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -726,30 +702,28 @@ define float @test13(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test13:
 ; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    setg %al
-; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
+; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    fcmovne %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test13:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jg .LBB12_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB12_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sgt i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -793,30 +767,28 @@ define float @test14(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test14:
 ; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    setge %al
-; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
+; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    fcmovne %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test14:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jge .LBB13_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB13_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sge i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -860,30 +832,28 @@ define float @test15(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test15:
 ; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    setl %al
-; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
+; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    fcmovne %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test15:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jl .LBB14_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB14_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp slt i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -927,30 +897,28 @@ define float @test16(i32 %a, i32 %b, float %x) nounwind {
 ;
 ; NOSSE1-LABEL: test16:
 ; NOSSE1:       # %bb.0:
-; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOSSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE1-NEXT:    setle %al
-; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE1-NEXT:    fxch %st(1)
+; NOSSE1-NEXT:    flds {{[0-9]+}}(%esp)
+; NOSSE1-NEXT:    testb %al, %al
 ; NOSSE1-NEXT:    fcmovne %st(1), %st
 ; NOSSE1-NEXT:    fstp %st(1)
 ; NOSSE1-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test16:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jle .LBB15_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB15_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sle i32 %a, %b
   %sel = select i1 %cmp, float 99.0, float %x
@@ -961,9 +929,8 @@ define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test17:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    fcmovnbe %st(1), %st
 ; SSE-NEXT:    fstp %st(1)
@@ -971,28 +938,26 @@ define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test17:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovnbe %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test17:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    ja .LBB16_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB16_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ugt i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1003,9 +968,8 @@ define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test18:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    fcmovnb %st(1), %st
 ; SSE-NEXT:    fstp %st(1)
@@ -1013,28 +977,26 @@ define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test18:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovnb %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test18:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jae .LBB17_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB17_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp uge i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1045,9 +1007,8 @@ define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test19:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    fcmovb %st(1), %st
 ; SSE-NEXT:    fstp %st(1)
@@ -1055,28 +1016,26 @@ define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test19:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovb %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test19:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jb .LBB18_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB18_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ult i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1087,9 +1046,8 @@ define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test20:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    fcmovbe %st(1), %st
 ; SSE-NEXT:    fstp %st(1)
@@ -1097,28 +1055,26 @@ define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test20:
 ; NOSSE:       # %bb.0:
+; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
 ; NOSSE-NEXT:    fcmovbe %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test20:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jbe .LBB19_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB19_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp ule i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1129,9 +1085,8 @@ define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test21:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    setg %al
 ; SSE-NEXT:    testb %al, %al
@@ -1141,30 +1096,28 @@ define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test21:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setg %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test21:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jg .LBB20_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB20_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
 ; We don't emit a branch for fp80, why?
   %cmp = icmp sgt i32 %a, %b
@@ -1176,9 +1129,8 @@ define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test22:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    setge %al
 ; SSE-NEXT:    testb %al, %al
@@ -1188,30 +1140,28 @@ define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test22:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setge %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test22:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jge .LBB21_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB21_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sge i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1222,9 +1172,8 @@ define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test23:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    setl %al
 ; SSE-NEXT:    testb %al, %al
@@ -1234,30 +1183,28 @@ define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test23:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setl %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test23:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jl .LBB22_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB22_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp slt i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
@@ -1268,9 +1215,8 @@ define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ; SSE-LABEL: test24:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; SSE-NEXT:    fxch %st(1)
+; SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    setle %al
 ; SSE-NEXT:    testb %al, %al
@@ -1280,30 +1226,28 @@ define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
 ;
 ; NOSSE-LABEL: test24:
 ; NOSSE:       # %bb.0:
-; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOSSE-NEXT:    setle %al
-; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; NOSSE-NEXT:    fxch %st(1)
+; NOSSE-NEXT:    fldt {{[0-9]+}}(%esp)
+; NOSSE-NEXT:    testb %al, %al
 ; NOSSE-NEXT:    fcmovne %st(1), %st
 ; NOSSE-NEXT:    fstp %st(1)
 ; NOSSE-NEXT:    retl
 ;
 ; NOCMOV-LABEL: test24:
 ; NOCMOV:       # %bb.0:
+; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; NOCMOV-NEXT:    jle .LBB23_2
 ; NOCMOV-NEXT:  # %bb.1:
-; NOCMOV-NEXT:    fstp %st(0)
+; NOCMOV-NEXT:    fstp %st(1)
 ; NOCMOV-NEXT:    fldz
-; NOCMOV-NEXT:    fxch %st(1)
 ; NOCMOV-NEXT:  .LBB23_2:
-; NOCMOV-NEXT:    fstp %st(1)
+; NOCMOV-NEXT:    fstp %st(0)
 ; NOCMOV-NEXT:    retl
   %cmp = icmp sle i32 %a, %b
   %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll
index d2d1c4db4608d..45b753d80fcab 100644
--- a/llvm/test/CodeGen/X86/cmovcmov.ll
+++ b/llvm/test/CodeGen/X86/cmovcmov.ll
@@ -23,16 +23,16 @@ define dso_local i32 @test_select_fcmp_oeq_i32(float %a, float %b, i32 %c, i32 %
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; NOCMOV-NEXT:    sahf
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    jne .LBB0_3
 ; NOCMOV-NEXT:  # %bb.1: # %entry
 ; NOCMOV-NEXT:    jp .LBB0_3
 ; NOCMOV-NEXT:  # %bb.2: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB0_3: # %entry
-; NOCMOV-NEXT:    movl (%eax), %eax
+; NOCMOV-NEXT:    movl (%ecx), %eax
 ; NOCMOV-NEXT:    retl
 entry:
   %cmp = fcmp oeq float %a, %b
@@ -55,9 +55,9 @@ define i64 @test_select_fcmp_oeq_i64(float %a, float %b, i64 %c, i64 %d) nounwin
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; NOCMOV-NEXT:    sahf
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB1_3
 ; NOCMOV-NEXT:  # %bb.1: # %entry
 ; NOCMOV-NEXT:    jp .LBB1_3
@@ -88,9 +88,9 @@ define i64 @test_select_fcmp_une_i64(float %a, float %b, i64 %c, i64 %d) nounwin
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; NOCMOV-NEXT:    sahf
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB2_3
 ; NOCMOV-NEXT:  # %bb.1: # %entry
 ; NOCMOV-NEXT:    jp .LBB2_3
@@ -125,16 +125,16 @@ define dso_local double @test_select_fcmp_oeq_f64(float %a, float %b, double %c,
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; NOCMOV-NEXT:    sahf
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    jne .LBB3_3
 ; NOCMOV-NEXT:  # %bb.1: # %entry
 ; NOCMOV-NEXT:    jp .LBB3_3
 ; NOCMOV-NEXT:  # %bb.2: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB3_3: # %entry
-; NOCMOV-NEXT:    fldl (%eax)
+; NOCMOV-NEXT:    fldl (%ecx)
 ; NOCMOV-NEXT:    retl
 entry:
   %cmp = fcmp oeq float %a, %b
@@ -157,53 +157,49 @@ define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <
 ;
 ; NOCMOV-LABEL: test_select_fcmp_oeq_v4i32:
 ; NOCMOV:       # %bb.0: # %entry
-; NOCMOV-NEXT:    pushl %edi
-; NOCMOV-NEXT:    pushl %esi
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; NOCMOV-NEXT:    sahf
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB4_3
 ; NOCMOV-NEXT:  # %bb.1: # %entry
 ; NOCMOV-NEXT:    jp .LBB4_3
 ; NOCMOV-NEXT:  # %bb.2: # %entry
 ; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB4_3: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl (%ecx), %ecx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOCMOV-NEXT:    movl %ecx, 12(%eax)
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB4_6
 ; NOCMOV-NEXT:  # %bb.4: # %entry
 ; NOCMOV-NEXT:    jp .LBB4_6
 ; NOCMOV-NEXT:  # %bb.5: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB4_6: # %entry
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; NOCMOV-NEXT:    movl (%ecx), %ecx
+; NOCMOV-NEXT:    movl %ecx, 8(%eax)
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB4_9
 ; NOCMOV-NEXT:  # %bb.7: # %entry
 ; NOCMOV-NEXT:    jp .LBB4_9
 ; NOCMOV-NEXT:  # %bb.8: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB4_9: # %entry
 ; NOCMOV-NEXT:    movl (%ecx), %ecx
-; NOCMOV-NEXT:    movl (%edx), %edx
-; NOCMOV-NEXT:    movl (%esi), %esi
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; NOCMOV-NEXT:    movl %ecx, 4(%eax)
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    jne .LBB4_12
 ; NOCMOV-NEXT:  # %bb.10: # %entry
 ; NOCMOV-NEXT:    jp .LBB4_12
 ; NOCMOV-NEXT:  # %bb.11: # %entry
-; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; NOCMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:  .LBB4_12: # %entry
-; NOCMOV-NEXT:    movl (%edi), %edi
-; NOCMOV-NEXT:    movl %edi, 12(%eax)
-; NOCMOV-NEXT:    movl %esi, 8(%eax)
-; NOCMOV-NEXT:    movl %edx, 4(%eax)
+; NOCMOV-NEXT:    movl (%ecx), %ecx
 ; NOCMOV-NEXT:    movl %ecx, (%eax)
-; NOCMOV-NEXT:    popl %esi
-; NOCMOV-NEXT:    popl %edi
 ; NOCMOV-NEXT:    retl $4
 entry:
   %cmp = fcmp oeq float %a, %b
@@ -227,10 +223,10 @@ define dso_local float @test_zext_fcmp_une(float %a, float %b) nounwind {
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
-; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
-; NOCMOV-NEXT:    sahf
 ; NOCMOV-NEXT:    fld1
 ; NOCMOV-NEXT:    fldz
+; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
+; NOCMOV-NEXT:    sahf
 ; NOCMOV-NEXT:    jne .LBB5_1
 ; NOCMOV-NEXT:  # %bb.2: # %entry
 ; NOCMOV-NEXT:    jp .LBB5_5
@@ -265,10 +261,10 @@ define dso_local float @test_zext_fcmp_oeq(float %a, float %b) nounwind {
 ; NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; NOCMOV-NEXT:    fucompp
 ; NOCMOV-NEXT:    fnstsw %ax
-; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
-; NOCMOV-NEXT:    sahf
 ; NOCMOV-NEXT:    fldz
 ; NOCMOV-NEXT:    fld1
+; NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
+; NOCMOV-NEXT:    sahf
 ; NOCMOV-NEXT:    jne .LBB6_1
 ; NOCMOV-NEXT:  # %bb.2: # %entry
 ; NOCMOV-NEXT:    jp .LBB6_5
@@ -325,9 +321,9 @@ define dso_local void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) nounwi
 ;
 ; NOCMOV-LABEL: no_cascade_opt:
 ; NOCMOV:       # %bb.0: # %entry
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movb $20, %al
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; NOCMOV-NEXT:    movb $20, %cl
 ; NOCMOV-NEXT:    jge .LBB7_1
 ; NOCMOV-NEXT:  # %bb.2: # %entry
diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll
index ff6ee68074088..4d757475e193f 100644
--- a/llvm/test/CodeGen/X86/cmp16.ll
+++ b/llvm/test/CodeGen/X86/cmp16.ll
@@ -446,9 +446,9 @@ define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
 ; X86-GENERIC-LABEL: cmp16_load_ne_load:
 ; X86-GENERIC:       # %bb.0:
 ; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-GENERIC-NEXT:    movzwl (%eax), %eax
 ; X86-GENERIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-GENERIC-NEXT:    movzwl (%ecx), %ecx
-; X86-GENERIC-NEXT:    cmpw (%eax), %cx
+; X86-GENERIC-NEXT:    cmpw (%ecx), %ax
 ; X86-GENERIC-NEXT:    setne %al
 ; X86-GENERIC-NEXT:    retl
 ;
@@ -462,9 +462,9 @@ define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
 ; X86-FAST-LABEL: cmp16_load_ne_load:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movzwl (%eax), %eax
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT:    movzwl (%ecx), %ecx
-; X86-FAST-NEXT:    cmpw (%eax), %cx
+; X86-FAST-NEXT:    cmpw (%ecx), %ax
 ; X86-FAST-NEXT:    setne %al
 ; X86-FAST-NEXT:    retl
 ;
@@ -477,10 +477,10 @@ define i1 @cmp16_load_ne_load(ptr %p0, ptr %p1) {
 ;
 ; X86-ATOM-LABEL: cmp16_load_ne_load:
 ; X86-ATOM:       # %bb.0:
-; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-ATOM-NEXT:    movzwl (%ecx), %ecx
-; X86-ATOM-NEXT:    cmpw (%eax), %cx
+; X86-ATOM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-ATOM-NEXT:    movzwl (%eax), %eax
+; X86-ATOM-NEXT:    cmpw (%ecx), %ax
 ; X86-ATOM-NEXT:    setne %al
 ; X86-ATOM-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll
index e58295fff9855..75dae745be2d9 100644
--- a/llvm/test/CodeGen/X86/cmpf-avx.ll
+++ b/llvm/test/CodeGen/X86/cmpf-avx.ll
@@ -43,8 +43,8 @@ define <8 x i32> @cmp_ne_sitofp(<8 x i32> %x) {
 define <8 x i32> @cmp_slt_fail_no_const(<8 x i32> %x, <8 x i32> %y) {
 ; X86-LABEL: cmp_slt_fail_no_const:
 ; X86:       # %bb.0:
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
 ; X86-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -95,17 +95,29 @@ define <8 x i32> @cmp_sgt_fail_no_bounds(<8 x i32> %x, <8 x i32> %y) {
 }
 
 define <8 x i32> @cmp_sgt_bitcast(<8 x i32> %xx, <8 x i32> %yy) {
-; CHECK-LABEL: cmp_sgt_bitcast:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
-; CHECK-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; CHECK-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: cmp_sgt_bitcast:
+; X86:       # %bb.0:
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
+; X86-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
+; X86-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmp_sgt_bitcast:
+; X64:       # %bb.0:
+; X64-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
+; X64-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
+; X64-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT:    retq
   %x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
   %y = and <8 x i32> %yy, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
 
diff --git a/llvm/test/CodeGen/X86/cmpxchg8b.ll b/llvm/test/CodeGen/X86/cmpxchg8b.ll
index 10e957015047b..f16fff3a7c722 100644
--- a/llvm/test/CodeGen/X86/cmpxchg8b.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg8b.ll
@@ -34,16 +34,16 @@ define void @t1(ptr nocapture %p) nounwind ssp {
 ; I486-NEXT:    movl %esp, %ebp
 ; I486-NEXT:    andl $-8, %esp
 ; I486-NEXT:    subl $8, %esp
-; I486-NEXT:    movl 8(%ebp), %eax
 ; I486-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; I486-NEXT:    movl $0, (%esp)
-; I486-NEXT:    movl %esp, %ecx
+; I486-NEXT:    movl %esp, %eax
+; I486-NEXT:    movl 8(%ebp), %ecx
 ; I486-NEXT:    pushl $5
 ; I486-NEXT:    pushl $5
 ; I486-NEXT:    pushl $0
 ; I486-NEXT:    pushl $1
-; I486-NEXT:    pushl %ecx
 ; I486-NEXT:    pushl %eax
+; I486-NEXT:    pushl %ecx
 ; I486-NEXT:    calll __atomic_compare_exchange_8@PLT
 ; I486-NEXT:    addl $24, %esp
 ; I486-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/coalescer-commute1.ll b/llvm/test/CodeGen/X86/coalescer-commute1.ll
index f4decb7e2e0c5..b45aecf4539da 100644
--- a/llvm/test/CodeGen/X86/coalescer-commute1.ll
+++ b/llvm/test/CodeGen/X86/coalescer-commute1.ll
@@ -8,18 +8,18 @@
 define void @runcont(ptr %source) nounwind  {
 ; CHECK-LABEL: runcont:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl L_NNTOT$non_lazy_ptr, %ecx
-; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    movl L_NNTOT$non_lazy_ptr, %eax
+; CHECK-NEXT:    movl (%eax), %eax
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_1: ## %bb
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vcvtsi2ssl (%eax,%edx,4), %xmm7, %xmm1
+; CHECK-NEXT:    vcvtsi2ssl (%edx,%ecx,4), %xmm7, %xmm1
 ; CHECK-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    incl %edx
-; CHECK-NEXT:    cmpl %edx, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl %ecx, %eax
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.2: ## %bb13
 ; CHECK-NEXT:    movl L_G$non_lazy_ptr, %eax
diff --git a/llvm/test/CodeGen/X86/combine-adc.ll b/llvm/test/CodeGen/X86/combine-adc.ll
index cb934c8664e45..f3e10244941e2 100644
--- a/llvm/test/CodeGen/X86/combine-adc.ll
+++ b/llvm/test/CodeGen/X86/combine-adc.ll
@@ -92,21 +92,19 @@ define i32 @adc_merge_constants(i32 %a0) nounwind {
 define i32 @adc_merge_sub(i32 %a0) nounwind {
 ; X86-LABEL: adc_merge_sub:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    addl $42, %edi
+; X86-NEXT:    addl $42, %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
+; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
-; X86-NEXT:    xorl %edi, %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: adc_merge_sub:
@@ -171,10 +169,10 @@ define i32 @adc_add(i32 %0, i32 %1, i32 %2, i32 %3) nounwind {
 define i32 @adc_add_wrong_flags(i32 %0, i32 %1, i32 %2, i32 %3) nounwind {
 ; X86-LABEL: adc_add_wrong_flags:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jb .LBB5_2
@@ -204,26 +202,22 @@ define i32 @adc_add_wrong_flags(i32 %0, i32 %1, i32 %2, i32 %3) nounwind {
 define i32 @adc_add_multi_use(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, ptr %5) nounwind {
 ; X86-LABEL: adc_add_multi_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    leal (%esi,%edx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    leal (%edi,%esi), %ebx
 ; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    movl %ebx, (%edx)
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    js .LBB6_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB6_2:
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: adc_add_multi_use:
diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll
index 1f074c877f3ae..80ccace37bd96 100644
--- a/llvm/test/CodeGen/X86/combine-bswap.ll
+++ b/llvm/test/CodeGen/X86/combine-bswap.ll
@@ -175,9 +175,9 @@ define void @demand_one_loaded_byte(ptr %xp, ptr %yp) {
 ; X86-LABEL: demand_one_loaded_byte:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    movb %al, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: demand_one_loaded_byte:
@@ -324,13 +324,13 @@ define i64 @bs_xor_rhs_bs64(i64 %a, i64 %b) #0 {
 define i32 @bs_and_all_operand_multiuse(i32 %a, i32 %b) #0 {
 ; X86-LABEL: bs_and_all_operand_multiuse:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl %eax, %edx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -414,8 +414,8 @@ define i64 @test_bswap64_shift17(i64 %a0) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shldl $17, %edx, %eax
-; X86-NEXT:    shll $17, %edx
 ; X86-NEXT:    bswapl %eax
+; X86-NEXT:    shll $17, %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    retl
 ;
@@ -434,9 +434,9 @@ define i64 @test_bswap64_shift17(i64 %a0) {
 define i64 @test_bswap64_shift48_multiuse(i64 %a0, ptr %a1) {
 ; X86-LABEL: test_bswap64_shift48_multiuse:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, (%ecx)
diff --git a/llvm/test/CodeGen/X86/combine-fneg.ll b/llvm/test/CodeGen/X86/combine-fneg.ll
index 8ca7fb81563fa..1e776fa0e1ffa 100644
--- a/llvm/test/CodeGen/X86/combine-fneg.ll
+++ b/llvm/test/CodeGen/X86/combine-fneg.ll
@@ -227,10 +227,10 @@ define void @fneg_int_bfloat(ptr %src, ptr %dst) nounwind {
 ; X86-SSE-LABEL: fneg_int_bfloat:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movzwl (%eax), %eax
+; X86-SSE-NEXT:    xorl $32768, %eax # imm = 0x8000
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movzwl (%ecx), %ecx
-; X86-SSE-NEXT:    xorl $32768, %ecx # imm = 0x8000
-; X86-SSE-NEXT:    movw %cx, (%eax)
+; X86-SSE-NEXT:    movw %ax, (%ecx)
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: fneg_int_bfloat:
@@ -266,18 +266,18 @@ define void @fneg_int_f64(ptr %src, ptr %dst) {
 ; X86-SSE1-LABEL: fneg_int_f64:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    fldl (%ecx)
+; X86-SSE1-NEXT:    fldl (%eax)
 ; X86-SSE1-NEXT:    fchs
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    fstpl (%eax)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE2-LABEL: fneg_int_f64:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE2-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    retl
 ;
@@ -298,9 +298,9 @@ define void @fneg_int_v4f32(ptr %src, ptr %dst) {
 ; X86-SSE-LABEL: fneg_int_v4f32:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
 ; X86-SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll
index 202c94d7aacab..d1179eae85f9e 100644
--- a/llvm/test/CodeGen/X86/combine-multiplies.ll
+++ b/llvm/test/CodeGen/X86/combine-multiplies.ll
@@ -36,12 +36,12 @@ define void @testCombineMultiplies(ptr nocapture %a, i32 %lll) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
-; CHECK-NEXT:    leal (%edx,%eax), %esi
-; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
-; CHECK-NEXT:    movl $22, 2080(%edx,%eax)
-; CHECK-NEXT:    movl $33, 10080(%edx,%eax)
+; CHECK-NEXT:    imull $400, %eax, %ecx # imm = 0x190
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    leal (%ecx,%edx), %esi
+; CHECK-NEXT:    movl $11, 2020(%esi,%eax,4)
+; CHECK-NEXT:    movl $33, 10080(%ecx,%edx)
+; CHECK-NEXT:    movl $22, 2080(%ecx,%edx)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
 entry:
@@ -107,19 +107,19 @@ define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
 ; CHECK-NEXT:    paddd %xmm0, %xmm1
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-NEXT:    pmuludq %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm1, x
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [22,22,22,22]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pmuludq %xmm1, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    pmuludq %xmm2, %xmm3
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
-; CHECK-NEXT:    paddd %xmm0, %xmm2
+; CHECK-NEXT:    pmuludq %xmm1, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [726,726,726,726]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, v3
 ; CHECK-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-NEXT:    movdqa %xmm2, v2
-; CHECK-NEXT:    movdqa %xmm0, v3
-; CHECK-NEXT:    movdqa %xmm1, x
+; CHECK-NEXT:    movdqa %xmm0, v2
 ; CHECK-NEXT:    retl
 entry:
   %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
@@ -141,18 +141,18 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
 ; CHECK-NEXT:    paddd %xmm0, %xmm1
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    movdqa %xmm1, x
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [22,33,44,55]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [33,33,55,55]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
-; CHECK-NEXT:    paddd %xmm0, %xmm2
+; CHECK-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [33,33,55,55]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [726,1452,2420,3630]
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, v3
 ; CHECK-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-NEXT:    movdqa %xmm2, v2
-; CHECK-NEXT:    movdqa %xmm0, v3
-; CHECK-NEXT:    movdqa %xmm1, x
+; CHECK-NEXT:    movdqa %xmm0, v2
 ; CHECK-NEXT:    retl
 entry:
   %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll
index 62744d4f3050a..ef2332d58fc11 100644
--- a/llvm/test/CodeGen/X86/combine-sbb.ll
+++ b/llvm/test/CodeGen/X86/combine-sbb.ll
@@ -10,13 +10,13 @@ define void @PR25858_i32(ptr sret(%WideUInt32), ptr, ptr) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %edx
-; X86-NEXT:    subl (%ecx), %esi
-; X86-NEXT:    sbbl 4(%ecx), %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl (%esi), %edx
+; X86-NEXT:    sbbl 4(%esi), %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
 ;
@@ -60,21 +60,21 @@ define void @PR25858_i64(ptr sret(%WideUInt64), ptr, ptr) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 12(%esi), %ecx
+; X86-NEXT:    movl 8(%esi), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl (%edi), %edx
-; X86-NEXT:    movl 4(%edi), %esi
-; X86-NEXT:    movl 12(%edi), %ecx
-; X86-NEXT:    movl 8(%edi), %edi
-; X86-NEXT:    subl 8(%ebx), %edi
-; X86-NEXT:    sbbl 12(%ebx), %ecx
-; X86-NEXT:    subl (%ebx), %edx
-; X86-NEXT:    sbbl 4(%ebx), %esi
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    subl 8(%edi), %edx
+; X86-NEXT:    sbbl 12(%edi), %ecx
+; X86-NEXT:    movl (%esi), %ebx
+; X86-NEXT:    movl 4(%esi), %esi
+; X86-NEXT:    subl (%edi), %ebx
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl 4(%edi), %esi
 ; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -368,21 +368,19 @@ define i32 @sbb_merge_add1(i32 %a0) nounwind {
 define i32 @sbb_merge_add2(i32 %a0) nounwind {
 ; X86-LABEL: sbb_merge_add2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl $42, %edi
+; X86-NEXT:    movl $42, %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
+; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $4, %esp
-; X86-NEXT:    xorl %edi, %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sbb_merge_add2:
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
index 7855ff2c7eb83..e3533713d471a 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -59,9 +59,9 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
 ; CHECK32-NEXT:    pushl %ebx # encoding: [0x53]
 ; CHECK32-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK32-NEXT:    .cfi_offset %ebx, -8
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; CHECK32-NEXT:    #APP
 ; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c]
 ; CHECK32-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
 ; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB1_2, kind: FK_PCRel_1
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 2859a87db3d56..2f55529a56f58 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -59,9 +59,9 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize {
 ; CHECK32-NEXT:    pushl %ebx # encoding: [0x53]
 ; CHECK32-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK32-NEXT:    .cfi_offset %ebx, -8
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; CHECK32-NEXT:    #APP
 ; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c]
 ; CHECK32-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
 ; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB1_2, kind: FK_PCRel_1
@@ -320,8 +320,8 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
 ; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_13, kind: FK_PCRel_1
 ; CHECK32-NEXT:  .LBB3_11: # %for.inc
 ; CHECK32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; CHECK32-NEXT:    incl %eax # encoding: [0x40]
 ; CHECK32-NEXT:    decl %edx # encoding: [0x4a]
+; CHECK32-NEXT:    incl %eax # encoding: [0x40]
 ; CHECK32-NEXT:    jmp .LBB3_1 # encoding: [0xeb,A]
 ; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_1, kind: FK_PCRel_1
 ; CHECK32-NEXT:  .LBB3_14:
diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
index 142ac754c3f7e..7a417f277e23c 100644
--- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll
@@ -1615,10 +1615,10 @@ define i64 @test_i64_140737488289792_mask_lshr_15(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_lshr_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $16, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $16, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1635,10 +1635,10 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_lshr_17(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_lshr_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $15, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $17, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1655,10 +1655,10 @@ define i64 @test_i64_140737488289792_mask_lshr_17(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_lshr_18(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_lshr_18:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $14, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $18, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1789,10 +1789,10 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_ashr_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $16, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $16, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1809,10 +1809,10 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_ashr_17(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_ashr_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $15, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $17, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1829,10 +1829,10 @@ define i64 @test_i64_140737488289792_mask_ashr_17(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_ashr_18(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_ashr_18:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $14, %ecx, %eax
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl $18, %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1995,9 +1995,9 @@ define i64 @test_i64_2147483647_mask_shl_34(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_shl_15:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shldl $15, %eax, %edx
 ; X86-NEXT:    andl $65536, %eax # imm = 0x10000
 ; X86-NEXT:    shll $15, %eax
@@ -2016,10 +2016,10 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) {
 ; X86-LABEL: test_i64_140737488289792_mask_shl_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $16, %eax, %edx
+; X86-NEXT:    movl $32767, %eax # imm = 0x7FFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrdl $16, %eax, %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
index e1711ccdbe13f..9b8ee1489e51e 100644
--- a/llvm/test/CodeGen/X86/copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -25,11 +25,10 @@ define dso_local i32 @test1() nounwind {
 ; X32-NEXT:    incl c
 ; X32-NEXT:    sete %dl
 ; X32-NEXT:    movb a, %ah
-; X32-NEXT:    movb %ah, %ch
-; X32-NEXT:    incb %ch
 ; X32-NEXT:    cmpb %cl, %ah
 ; X32-NEXT:    sete d
-; X32-NEXT:    movb %ch, a
+; X32-NEXT:    incb %ah
+; X32-NEXT:    movb %ah, a
 ; X32-NEXT:    testb %dl, %dl
 ; X32-NEXT:    jne .LBB0_2
 ; X32-NEXT:  # %bb.1: # %if.then
@@ -293,30 +292,24 @@ bb1:
 define dso_local void @PR37431(ptr %arg1, ptr %arg2, ptr %arg3, i32 %arg4, i64 %arg5) nounwind {
 ; X32-LABEL: PR37431:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    sarl $31, %edx
+; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl (%edi), %edi
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    sarl $31, %ebp
-; X32-NEXT:    xorl %ebx, %ebx
-; X32-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    sbbl %ebp, %esi
-; X32-NEXT:    sbbl %ebx, %ebx
-; X32-NEXT:    movb %bl, (%edx)
+; X32-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    sbbl %edx, %esi
+; X32-NEXT:    sbbl %ecx, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movb %cl, (%eax)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    cltd
-; X32-NEXT:    idivl %ebx
-; X32-NEXT:    movb %dl, (%ecx)
+; X32-NEXT:    idivl %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movb %dl, (%eax)
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
-; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: PR37431:
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index c5aa2a9f40239..f2b389afd1bbe 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -34,9 +34,9 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    notb %al
 ; X86-CMOV-NEXT:    movzbl %al, %eax
-; X86-CMOV-NEXT:    bsrl %eax, %ecx
-; X86-CMOV-NEXT:    movl $15, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $15, %ecx
+; X86-CMOV-NEXT:    bsrl %eax, %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl $7, %eax
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
@@ -135,9 +135,9 @@ define i16 @ctlo_i16(i16 %x) {
 ; X86-CMOV:       # %bb.0:
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    notl %eax
-; X86-CMOV-NEXT:    bsrw %ax, %cx
-; X86-CMOV-NEXT:    movw $31, %ax
-; X86-CMOV-NEXT:    cmovnew %cx, %ax
+; X86-CMOV-NEXT:    movw $31, %cx
+; X86-CMOV-NEXT:    bsrw %ax, %ax
+; X86-CMOV-NEXT:    cmovew %cx, %ax
 ; X86-CMOV-NEXT:    xorl $15, %eax
 ; X86-CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-CMOV-NEXT:    retl
@@ -221,9 +221,9 @@ define i32 @ctlo_i32(i32 %x) {
 ; X86-CMOV:       # %bb.0:
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    notl %eax
-; X86-CMOV-NEXT:    bsrl %eax, %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl %eax, %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -289,17 +289,17 @@ define i64 @ctlo_i64(i64 %x) nounwind {
 ; X86-NOCMOV-LABEL: ctlo_i64:
 ; X86-NOCMOV:       # %bb.0:
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOCMOV-NEXT:    notl %eax
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOCMOV-NEXT:    notl %ecx
-; X86-NOCMOV-NEXT:    notl %eax
-; X86-NOCMOV-NEXT:    movl %eax, %edx
-; X86-NOCMOV-NEXT:    orl %ecx, %edx
+; X86-NOCMOV-NEXT:    movl %ecx, %edx
+; X86-NOCMOV-NEXT:    orl %eax, %edx
 ; X86-NOCMOV-NEXT:    je .LBB6_1
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
-; X86-NOCMOV-NEXT:    testl %ecx, %ecx
+; X86-NOCMOV-NEXT:    testl %eax, %eax
 ; X86-NOCMOV-NEXT:    jne .LBB6_3
 ; X86-NOCMOV-NEXT:  # %bb.4: # %cond.false
-; X86-NOCMOV-NEXT:    bsrl %eax, %eax
+; X86-NOCMOV-NEXT:    bsrl %ecx, %eax
 ; X86-NOCMOV-NEXT:    xorl $31, %eax
 ; X86-NOCMOV-NEXT:    orl $32, %eax
 ; X86-NOCMOV-NEXT:    xorl %edx, %edx
@@ -309,27 +309,29 @@ define i64 @ctlo_i64(i64 %x) nounwind {
 ; X86-NOCMOV-NEXT:    xorl %edx, %edx
 ; X86-NOCMOV-NEXT:    retl
 ; X86-NOCMOV-NEXT:  .LBB6_3:
-; X86-NOCMOV-NEXT:    bsrl %ecx, %eax
+; X86-NOCMOV-NEXT:    bsrl %eax, %eax
 ; X86-NOCMOV-NEXT:    xorl $31, %eax
 ; X86-NOCMOV-NEXT:    xorl %edx, %edx
 ; X86-NOCMOV-NEXT:    retl
 ;
 ; X86-CMOV-LABEL: ctlo_i64:
 ; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    pushl %esi
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    notl %ecx
 ; X86-CMOV-NEXT:    notl %eax
-; X86-CMOV-NEXT:    bsrl %eax, %eax
 ; X86-CMOV-NEXT:    movl $63, %edx
-; X86-CMOV-NEXT:    cmovnel %eax, %edx
-; X86-CMOV-NEXT:    xorl $31, %edx
-; X86-CMOV-NEXT:    addl $32, %edx
+; X86-CMOV-NEXT:    bsrl %eax, %esi
+; X86-CMOV-NEXT:    cmovel %edx, %esi
+; X86-CMOV-NEXT:    xorl $31, %esi
+; X86-CMOV-NEXT:    addl $32, %esi
+; X86-CMOV-NEXT:    notl %ecx
 ; X86-CMOV-NEXT:    bsrl %ecx, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
 ; X86-CMOV-NEXT:    testl %ecx, %ecx
-; X86-CMOV-NEXT:    cmovel %edx, %eax
+; X86-CMOV-NEXT:    cmovel %esi, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
+; X86-CMOV-NEXT:    popl %esi
 ; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlo_i64:
@@ -393,10 +395,10 @@ define i64 @ctlo_i64_undef(i64 %x) {
 ; X86-CMOV:       # %bb.0:
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    notl %eax
 ; X86-CMOV-NEXT:    notl %ecx
 ; X86-CMOV-NEXT:    bsrl %ecx, %edx
 ; X86-CMOV-NEXT:    xorl $31, %edx
+; X86-CMOV-NEXT:    notl %eax
 ; X86-CMOV-NEXT:    bsrl %eax, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
 ; X86-CMOV-NEXT:    orl $32, %eax
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 789c681174efb..0e4098da1be68 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -236,10 +236,10 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ;
 ; X86-CMOV-LABEL: ctlz_i8_zero_test:
 ; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movl $15, %ecx
 ; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT:    bsrl %eax, %ecx
-; X86-CMOV-NEXT:    movl $15, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    bsrl %eax, %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl $7, %eax
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
@@ -307,9 +307,9 @@ define i16 @ctlz_i16_zero_test(i16 %n) {
 ;
 ; X86-CMOV-LABEL: ctlz_i16_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrw {{[0-9]+}}(%esp), %cx
-; X86-CMOV-NEXT:    movw $31, %ax
-; X86-CMOV-NEXT:    cmovnew %cx, %ax
+; X86-CMOV-NEXT:    movw $31, %cx
+; X86-CMOV-NEXT:    bsrw {{[0-9]+}}(%esp), %ax
+; X86-CMOV-NEXT:    cmovew %cx, %ax
 ; X86-CMOV-NEXT:    xorl $15, %eax
 ; X86-CMOV-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-CMOV-NEXT:    retl
@@ -362,9 +362,9 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
 ;
 ; X86-CMOV-LABEL: ctlz_i32_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -428,16 +428,16 @@ define i64 @ctlz_i64_zero_test(i64 %n) nounwind {
 ;
 ; X86-CMOV-LABEL: ctlz_i64_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT:    movl $63, %edx
-; X86-CMOV-NEXT:    cmovnel %eax, %edx
-; X86-CMOV-NEXT:    xorl $31, %edx
-; X86-CMOV-NEXT:    addl $32, %edx
-; X86-CMOV-NEXT:    bsrl %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %eax
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    cmovel %eax, %ecx
+; X86-CMOV-NEXT:    xorl $31, %ecx
+; X86-CMOV-NEXT:    addl $32, %ecx
+; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-CMOV-NEXT:    bsrl %edx, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
-; X86-CMOV-NEXT:    testl %ecx, %ecx
-; X86-CMOV-NEXT:    cmovel %edx, %eax
+; X86-CMOV-NEXT:    testl %edx, %edx
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
@@ -606,9 +606,9 @@ define i32 @ctlz_bsr_zero_test(i32 %n) {
 ;
 ; X86-CMOV-LABEL: ctlz_bsr_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_bsr_zero_test:
@@ -735,9 +735,9 @@ define i64 @ctlz_i64_zero_test_knownneverzero(i64 %n) {
 ; X86-CMOV:       # %bb.0:
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    orl $1, %eax
 ; X86-CMOV-NEXT:    bsrl %ecx, %edx
 ; X86-CMOV-NEXT:    xorl $31, %edx
+; X86-CMOV-NEXT:    orl $1, %eax
 ; X86-CMOV-NEXT:    bsrl %eax, %eax
 ; X86-CMOV-NEXT:    xorl $31, %eax
 ; X86-CMOV-NEXT:    orl $32, %eax
@@ -854,9 +854,9 @@ define i8 @PR47603_trunc(i32 %0) {
 define i32 @PR47603_zext(i32 %a0, ptr %a1) {
 ; X86-LABEL: PR47603_zext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movsbl (%eax,%ecx), %eax
+; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movsbl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR47603_zext:
@@ -867,9 +867,9 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) {
 ;
 ; X86-CLZ-LABEL: PR47603_zext:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CLZ-NEXT:    movsbl (%eax,%ecx), %eax
+; X86-CLZ-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-CLZ-NEXT:    movsbl (%ecx,%eax), %eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: PR47603_zext:
@@ -888,10 +888,10 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) {
 ;
 ; X86-FASTLZCNT-LABEL: PR47603_zext:
 ; X86-FASTLZCNT:       # %bb.0:
-; X86-FASTLZCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FASTLZCNT-NEXT:    lzcntl {{[0-9]+}}(%esp), %ecx
-; X86-FASTLZCNT-NEXT:    xorl $31, %ecx
-; X86-FASTLZCNT-NEXT:    movsbl (%eax,%ecx), %eax
+; X86-FASTLZCNT-NEXT:    lzcntl {{[0-9]+}}(%esp), %eax
+; X86-FASTLZCNT-NEXT:    xorl $31, %eax
+; X86-FASTLZCNT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FASTLZCNT-NEXT:    movsbl (%ecx,%eax), %eax
 ; X86-FASTLZCNT-NEXT:    retl
   %ctlz = tail call i32 @llvm.ctlz.i32(i32 %a0, i1 true)
   %xor = xor i32 %ctlz, 31
@@ -973,10 +973,10 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ;
 ; X86-CMOV-LABEL: ctlz_xor7_i8_false:
 ; X86-CMOV:       # %bb.0:
+; X86-CMOV-NEXT:    movl $15, %ecx
 ; X86-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT:    bsrl %eax, %ecx
-; X86-CMOV-NEXT:    movl $15, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    bsrl %eax, %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -1085,9 +1085,9 @@ define i32 @ctlz_xor31_i32_false(i32 %x) {
 ;
 ; X86-CMOV-LABEL: ctlz_xor31_i32_false:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: ctlz_xor31_i32_false:
@@ -1228,9 +1228,9 @@ define i64 @ctlz_i32_sext(i32 %x) {
 ;
 ; X86-CMOV-LABEL: ctlz_i32_sext:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
@@ -1290,9 +1290,9 @@ define i64 @ctlz_i32_zext(i32 %x) {
 ;
 ; X86-CMOV-LABEL: ctlz_i32_zext:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $63, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $63, %ecx
+; X86-CMOV-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index a88fb96dd7c8c..d44a7b0e93790 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -317,9 +317,9 @@ define i32 @cttz_i32_zero_test(i32 %n) {
 ;
 ; X86-CMOV-LABEL: cttz_i32_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $32, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $32, %ecx
+; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    retl
 ;
 ; X64-LABEL: cttz_i32_zero_test:
@@ -379,13 +379,13 @@ define i64 @cttz_i64_zero_test(i64 %n) nounwind {
 ;
 ; X86-CMOV-LABEL: cttz_i64_zero_test:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    movl $32, %eax
 ; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $32, %edx
-; X86-CMOV-NEXT:    cmovnel %ecx, %edx
-; X86-CMOV-NEXT:    addl $32, %edx
+; X86-CMOV-NEXT:    cmovel %eax, %ecx
+; X86-CMOV-NEXT:    addl $32, %ecx
+; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    bsfl %eax, %eax
-; X86-CMOV-NEXT:    cmovel %edx, %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
@@ -518,9 +518,9 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
 ; X86-CMOV-LABEL: cttz_i64_zero_test_knownneverzero:
 ; X86-CMOV:       # %bb.0:
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT:    rep bsfl %ecx, %edx
 ; X86-CMOV-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-CMOV-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NEXT:    rep bsfl %ecx, %edx
 ; X86-CMOV-NEXT:    rep bsfl %eax, %eax
 ; X86-CMOV-NEXT:    orl $32, %eax
 ; X86-CMOV-NEXT:    testl %ecx, %ecx
@@ -672,9 +672,9 @@ define i64 @cttz_i32_sext(i32 %x) {
 ;
 ; X86-CMOV-LABEL: cttz_i32_sext:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $32, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $32, %ecx
+; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
@@ -727,9 +727,9 @@ define i64 @cttz_i32_zext(i32 %x) {
 ;
 ; X86-CMOV-LABEL: cttz_i32_zext:
 ; X86-CMOV:       # %bb.0:
-; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-CMOV-NEXT:    movl $32, %eax
-; X86-CMOV-NEXT:    cmovnel %ecx, %eax
+; X86-CMOV-NEXT:    movl $32, %ecx
+; X86-CMOV-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
+; X86-CMOV-NEXT:    cmovel %ecx, %eax
 ; X86-CMOV-NEXT:    xorl %edx, %edx
 ; X86-CMOV-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/cvtv2f32.ll b/llvm/test/CodeGen/X86/cvtv2f32.ll
index 3875b72d5d68a..62232a65fcbdc 100644
--- a/llvm/test/CodeGen/X86/cvtv2f32.ll
+++ b/llvm/test/CodeGen/X86/cvtv2f32.ll
@@ -8,16 +8,16 @@
 define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) {
 ; X86-LABEL: uitofp_2i32_cvt_buildvector:
 ; X86:       # %bb.0:
-; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    movsd {{.*#+}} xmm2 = [4.503599627370496E+15,0.0E+0]
-; X86-NEXT:    orpd %xmm2, %xmm1
-; X86-NEXT:    subsd %xmm2, %xmm1
-; X86-NEXT:    cvtsd2ss %xmm1, %xmm1
+; X86-NEXT:    movsd {{.*#+}} xmm1 = [4.503599627370496E+15,0.0E+0]
+; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    orpd %xmm1, %xmm2
+; X86-NEXT:    subsd %xmm1, %xmm2
+; X86-NEXT:    cvtsd2ss %xmm2, %xmm2
 ; X86-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT:    orpd %xmm2, %xmm3
-; X86-NEXT:    subsd %xmm2, %xmm3
-; X86-NEXT:    xorps %xmm2, %xmm2
-; X86-NEXT:    cvtsd2ss %xmm3, %xmm2
+; X86-NEXT:    orpd %xmm1, %xmm3
+; X86-NEXT:    subsd %xmm1, %xmm3
+; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    cvtsd2ss %xmm3, %xmm1
 ; X86-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X86-NEXT:    mulps %xmm1, %xmm0
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/dagcombine-buildvector.ll b/llvm/test/CodeGen/X86/dagcombine-buildvector.ll
index 9ddb7a3d425ec..d337c33763caa 100644
--- a/llvm/test/CodeGen/X86/dagcombine-buildvector.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-buildvector.ll
@@ -7,8 +7,8 @@
 define void @test(ptr %dst, <4 x double> %src) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps %xmm0, (%eax)
 ; CHECK-NEXT:    retl
 entry:
@@ -21,8 +21,8 @@ define void @test2(ptr %src, ptr %dest) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movdqa %xmm0, (%eax)
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 3efd536adc4d1..b5baafb9ca881 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -6,10 +6,10 @@ define i32 @t(ptr %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
 ; X86-LABEL: t:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%eax,%ecx), %eax
+; X86-NEXT:    movl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t:
@@ -51,52 +51,53 @@ define i96 @square_high(i96 %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    adcl %edx, %ebp
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    movzbl %al, %ecx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    setb %dl
 ; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %eax
-; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/dagcombine-dead-store.ll b/llvm/test/CodeGen/X86/dagcombine-dead-store.ll
index f509e19bedacf..35d2133335438 100644
--- a/llvm/test/CodeGen/X86/dagcombine-dead-store.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-dead-store.ll
@@ -76,8 +76,8 @@ define void @output_indexed_fs_diff(i32 %v, ptr %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %eax, 168(%ecx)
 ; CHECK-NEXT:    movl %eax, %fs:184(%ecx)
+; CHECK-NEXT:    movl %eax, 168(%ecx)
 ; CHECK-NEXT:    retl
   %p = getelementptr i32, ptr %b, i64 42
   %pa = addrspacecast ptr %p to ptr addrspace(257)
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index e9a1e8ed728a4..916d5852207b1 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -178,10 +178,10 @@ entry:
 define i64 @fun9(i32 zeroext %v) {
 ; X86-LABEL: fun9:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    sarl $4, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    andl $-16, %eax
+; X86-NEXT:    sarl $4, %edx
 ; X86-NEXT:    shrl $28, %edx
 ; X86-NEXT:    retl
 ;
@@ -299,10 +299,9 @@ entry:
 define void @g(i32 %a) nounwind {
 ; X86-LABEL: g:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $-4, %eax
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll f
@@ -366,27 +365,23 @@ define i32 @shift_zext_shl2(i8 zeroext %x) {
 define <4 x i32> @shift_zext_shl_vec(<4 x i8> %x) nounwind {
 ; X86-LABEL: shift_zext_shl_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $23, %ecx
+; X86-NEXT:    shll $6, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $63, %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $64, %ecx
 ; X86-NEXT:    shll $9, %ecx
-; X86-NEXT:    andl $63, %edx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    andl $31, %esi
-; X86-NEXT:    shll $7, %esi
-; X86-NEXT:    andl $23, %edi
-; X86-NEXT:    shll $6, %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: shift_zext_shl_vec:
@@ -407,27 +402,23 @@ define <4 x i32> @shift_zext_shl_vec(<4 x i8> %x) nounwind {
 define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
 ; X86-LABEL: shift_zext_shl2_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $23, %ecx
+; X86-NEXT:    shll $6, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $31, %ecx
+; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $63, %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl $23, %edi
-; X86-NEXT:    andl $31, %esi
-; X86-NEXT:    andl $63, %edx
 ; X86-NEXT:    andl $64, %ecx
 ; X86-NEXT:    shll $9, %ecx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    shll $7, %esi
-; X86-NEXT:    shll $6, %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: shift_zext_shl2_vec:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 00f2e012c8b12..5421cbaa420ed 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -13,8 +13,8 @@ define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    idivb {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movsbl %ah, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb %al, (%edx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
@@ -96,24 +96,23 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, 4(%edx)
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl %edx, 4(%ebx)
+; X86-NEXT:    movl %eax, (%ebx)
 ; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -152,391 +151,400 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $176, %esp
-; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    xorl %edi, %edx
 ; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    subl %edi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    sete %dl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    sete %cl
-; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bsrl %ebx, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %ecx
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    xorl %edi, %ebx
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    subl %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    bsrl %esi, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %edi
-; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    orl $64, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    sete %dl
+; X86-NEXT:    orb %cl, %dl
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %select.false.sink
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:  .LBB4_3: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  # %bb.10: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    cmovnel %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_9
+; X86-NEXT:  # %bb.4: # %select.end
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    je .LBB4_11
-; X86-NEXT:  # %bb.8: # %udiv-bb1
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 152(%esp,%eax), %edx
-; X86-NEXT:    movl 156(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 144(%esp,%eax), %esi
+; X86-NEXT:    movl 152(%esp,%eax), %esi
+; X86-NEXT:    movl 156(%esp,%eax), %ebx
+; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 144(%esp,%eax), %ebx
 ; X86-NEXT:    movl 148(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    shldl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %edx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB4_9
-; X86-NEXT:  # %bb.5: # %udiv-preheader
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    jb .LBB4_10
+; X86-NEXT:  # %bb.6: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 108(%esp,%eax), %ebx
+; X86-NEXT:    movl 108(%esp,%eax), %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl 104(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%eax), %esi
-; X86-NEXT:    movl 100(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NEXT:    movl 100(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB4_6: # %udiv-do-while
+; X86-NEXT:  .LBB4_7: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    shldl $1, %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    leal (%eax,%eax), %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl $31, %edi, %eax
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jne .LBB4_6
-; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    shldl $1, %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    leal (%edi,%edx,2), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:  .LBB4_11: # %udiv-end
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_7
+; X86-NEXT:  .LBB4_8: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    leal (%eax,%esi,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl $31, %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:  .LBB4_9: # %udiv-end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    xorl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    subl %edi, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl 56(%ebp), %eax
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 40(%ebp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull 40(%ebp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl 44(%ebp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull 44(%ebp), %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movl 48(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl 52(%ebp), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    imull %edx, %esi
-; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 24(%ebp), %edx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %ecx, 4(%edx)
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, 8(%edx)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %eax, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -545,14 +553,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB4_1:
 ; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_3
-; X86-NEXT:  .LBB4_9:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_4:
+; X86-NEXT:  .LBB4_10:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jmp .LBB4_11
+; X86-NEXT:    jmp .LBB4_8
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
@@ -673,11 +680,10 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; X86-NEXT:    movd %edi, %xmm2
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X86-NEXT:    movd %ebx, %xmm5
+; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    movd %ecx, %xmm6
-; X86-NEXT:    movl 8(%ebp), %ecx
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movd %eax, %xmm2
@@ -686,7 +692,8 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; X86-NEXT:    movdqa %xmm2, %xmm4
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm4, (%ecx)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqa %xmm4, (%eax)
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
 ; X86-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -829,75 +836,73 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pextrw $7, %xmm0, %eax
-; X86-NEXT:    pextrw $7, %xmm1, %esi
+; X86-NEXT:    pextrw $7, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    pextrw $6, %xmm0, %eax
-; X86-NEXT:    pextrw $6, %xmm1, %esi
+; X86-NEXT:    pextrw $6, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; X86-NEXT:    pextrw $5, %xmm0, %eax
-; X86-NEXT:    pextrw $5, %xmm1, %esi
+; X86-NEXT:    pextrw $5, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $4, %xmm0, %eax
-; X86-NEXT:    pextrw $4, %xmm1, %esi
+; X86-NEXT:    pextrw $4, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    pextrw $3, %xmm0, %eax
-; X86-NEXT:    pextrw $3, %xmm1, %esi
+; X86-NEXT:    pextrw $3, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $2, %xmm0, %eax
-; X86-NEXT:    pextrw $2, %xmm1, %esi
+; X86-NEXT:    pextrw $2, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    pextrw $1, %xmm0, %eax
-; X86-NEXT:    pextrw $1, %xmm1, %esi
+; X86-NEXT:    pextrw $1, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm5
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; X86-NEXT:    movdqa %xmm5, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm5, (%eax)
 ; X86-NEXT:    pmullw %xmm1, %xmm5
 ; X86-NEXT:    psubw %xmm5, %xmm0
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i16:
@@ -979,38 +984,37 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
 ; X86-NEXT:    movd %xmm2, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm2, %esi
+; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm2, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm2, %esi
+; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm4, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm4, %esi
+; X86-NEXT:    movd %xmm4, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; X86-NEXT:    movdqa %xmm3, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; X86-NEXT:    pmuludq %xmm1, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -1019,7 +1023,6 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounw
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i32:
@@ -1073,11 +1076,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounw
 define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
@@ -1105,7 +1106,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm3, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlq $32, %xmm1
@@ -1120,7 +1122,6 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    psubq %xmm3, %xmm0
 ; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i64:
@@ -1218,20 +1219,18 @@ define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X86-LABEL: extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extrause:
@@ -1257,18 +1256,16 @@ define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr %sremdst) nounwind {
 ; X86-LABEL: multiple_bb:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
-; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB12_2
 ; X86-NEXT:  # %bb.1: # %do_srem
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1279,7 +1276,6 @@ define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr
 ; X86-NEXT:  .LBB12_2: # %end
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: multiple_bb:
@@ -1313,20 +1309,16 @@ end:
 define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: negative_different_x:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %edi, %eax
+; X86-NEXT:    idivl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_different_x:
@@ -1350,17 +1342,15 @@ define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind
 define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst) nounwind {
 ; X86-LABEL: negative_different_y:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_different_y:
@@ -1384,17 +1374,15 @@ define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst)
 define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: negative_inverted_division:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
-; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_inverted_division:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 3d756f3cf2141..0e488c8703295 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -13,8 +13,8 @@ define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    divb {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl %ah, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb %al, (%edx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
@@ -96,24 +96,23 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, 4(%edx)
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl %edx, 4(%ebx)
+; X86-NEXT:    movl %eax, (%ebx)
 ; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -152,145 +151,145 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 48(%ebp), %ebx
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 52(%ebp), %esi
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %cl
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    orl 36(%ebp), %eax
-; X86-NEXT:    movl 24(%ebp), %edx
-; X86-NEXT:    orl 32(%ebp), %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    sete %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    bsrl 32(%ebp), %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl 28(%ebp), %ebx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebx, %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl 48(%ebp), %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl 40(%ebp), %ebx
 ; X86-NEXT:    xorl $31, %ebx
 ; X86-NEXT:    orl $32, %ebx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ebx
 ; X86-NEXT:    orl $64, %ebx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    bsrl %edi, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl 32(%ebp), %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    bsrl %eax, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl 24(%ebp), %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    orl 52(%ebp), %edx
+; X86-NEXT:    cmovnel %eax, %ebx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    orl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl 52(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    orl 48(%ebp), %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    sete %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %select.false.sink
-; X86-NEXT:    movl $127, %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %al
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    cmpl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:  .LBB4_3: # %select.end
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    jne .LBB4_9
-; X86-NEXT:  # %bb.4: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    je .LBB4_9
-; X86-NEXT:  # %bb.5: # %udiv-bb1
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    cmovnel %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    cmovnel %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    cmovnel %edi, %ecx
+; X86-NEXT:    jne .LBB4_4
+; X86-NEXT:  # %bb.10: # %select.end
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    je .LBB4_11
+; X86-NEXT:  # %bb.8: # %udiv-bb1
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %esi
-; X86-NEXT:    movl 140(%esp,%eax), %ebx
-; X86-NEXT:    shldl %cl, %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
-; X86-NEXT:    movl 132(%esp,%eax), %ebx
-; X86-NEXT:    shldl %cl, %ebx, %esi
+; X86-NEXT:    movl 136(%esp,%eax), %edx
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %ebx
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl 132(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    addl $1, %ecx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %edx
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB4_10
-; X86-NEXT:  # %bb.6: # %udiv-preheader
+; X86-NEXT:    jb .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -300,201 +299,201 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl 92(%esp,%eax), %ebx
+; X86-NEXT:    movl 88(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
+; X86-NEXT:    movl 80(%esp,%eax), %esi
 ; X86-NEXT:    movl 84(%esp,%eax), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%ebp), %ecx
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB4_7: # %udiv-do-while
-; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    shldl $1, %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %ebx
-; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB4_6: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $1, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl 52(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl 48(%ebp), %esi
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl 44(%ebp), %ebx
-; X86-NEXT:    andl 40(%ebp), %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 52(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl 44(%ebp), %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    andl 40(%ebp), %edi
+; X86-NEXT:    subl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    leal (%eax,%eax), %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    shrdl $31, %edx, %eax
+; X86-NEXT:    shrdl $31, %edi, %edx
+; X86-NEXT:    shldl $1, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    jne .LBB4_6
+; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
+; X86-NEXT:    leal (%ecx,%esi,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl $31, %edi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jne .LBB4_7
-; X86-NEXT:  .LBB4_8: # %udiv-loop-exit
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%edx,2), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:  .LBB4_9: # %udiv-end
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:  .LBB4_11: # %udiv-end
 ; X86-NEXT:    movl 56(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl 48(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 52(%ebp), %edi
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull 40(%ebp), %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    imull %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %edi, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 44(%ebp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 44(%ebp)
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl 28(%ebp), %ebx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl %ecx, (%esi)
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl 36(%ebp), %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%esi)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, 12(%esi)
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -502,12 +501,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movb $1, %al
+; X86-NEXT:    movb $1, %cl
 ; X86-NEXT:    jmp .LBB4_3
-; X86-NEXT:  .LBB4_10:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jmp .LBB4_8
+; X86-NEXT:  .LBB4_9:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    jmp .LBB4_7
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_11
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
@@ -628,11 +631,10 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
 ; X86-NEXT:    movd %edi, %xmm2
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X86-NEXT:    movd %ebx, %xmm5
+; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    movd %ecx, %xmm6
-; X86-NEXT:    movl 8(%ebp), %ecx
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movd %eax, %xmm2
@@ -641,7 +643,8 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; X86-NEXT:    movdqa %xmm2, %xmm4
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X86-NEXT:    movdqa %xmm4, (%ecx)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqa %xmm4, (%eax)
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
 ; X86-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -784,75 +787,73 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pextrw $7, %xmm0, %eax
-; X86-NEXT:    pextrw $7, %xmm1, %esi
+; X86-NEXT:    pextrw $7, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    pextrw $6, %xmm0, %eax
-; X86-NEXT:    pextrw $6, %xmm1, %esi
+; X86-NEXT:    pextrw $6, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; X86-NEXT:    pextrw $5, %xmm0, %eax
-; X86-NEXT:    pextrw $5, %xmm1, %esi
+; X86-NEXT:    pextrw $5, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $4, %xmm0, %eax
-; X86-NEXT:    pextrw $4, %xmm1, %esi
+; X86-NEXT:    pextrw $4, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    pextrw $3, %xmm0, %eax
-; X86-NEXT:    pextrw $3, %xmm1, %esi
+; X86-NEXT:    pextrw $3, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    pextrw $2, %xmm0, %eax
-; X86-NEXT:    pextrw $2, %xmm1, %esi
+; X86-NEXT:    pextrw $2, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; X86-NEXT:    pextrw $1, %xmm0, %eax
-; X86-NEXT:    pextrw $1, %xmm1, %esi
+; X86-NEXT:    pextrw $1, %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divw %si
+; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movd %eax, %xmm5
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; X86-NEXT:    movdqa %xmm5, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm5, (%eax)
 ; X86-NEXT:    pmullw %xmm1, %xmm5
 ; X86-NEXT:    psubw %xmm5, %xmm0
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i16:
@@ -934,38 +935,37 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
 ; X86-NEXT:    movd %xmm2, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm2, %esi
+; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm2, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm2, %esi
+; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm4, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm4, %esi
+; X86-NEXT:    movd %xmm4, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm4
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; X86-NEXT:    movdqa %xmm3, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
 ; X86-NEXT:    pmuludq %xmm1, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -974,7 +974,6 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounw
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i32:
@@ -1028,11 +1027,9 @@ define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounw
 define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounwind {
 ; X86-LABEL: vector_i128_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
@@ -1060,7 +1057,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm3, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlq $32, %xmm1
@@ -1075,7 +1073,6 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    psubq %xmm3, %xmm0
 ; X86-NEXT:    addl $64, %esp
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vector_i128_i64:
@@ -1148,20 +1145,18 @@ define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
 define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 ; X86-LABEL: extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extrause:
@@ -1187,18 +1182,16 @@ define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
 define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_urem, ptr %uremdst) nounwind {
 ; X86-LABEL: multiple_bb:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %esi
-; X86-NEXT:    movl %eax, (%edi)
-; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB11_2
 ; X86-NEXT:  # %bb.1: # %do_urem
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1209,7 +1202,6 @@ define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_urem, ptr
 ; X86-NEXT:  .LBB11_2: # %end
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: multiple_bb:
@@ -1243,20 +1235,16 @@ end:
 define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: negative_different_x:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    imull %edi, %eax
+; X86-NEXT:    divl %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_different_x:
@@ -1280,17 +1268,15 @@ define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind
 define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst) nounwind {
 ; X86-LABEL: negative_different_y:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_different_y:
@@ -1314,17 +1300,15 @@ define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst)
 define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
 ; X86-LABEL: negative_inverted_division:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
-; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, (%edx)
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: negative_inverted_division:
diff --git a/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
index 0924ea85a0126..c5ffa3e746950 100644
--- a/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
+++ b/llvm/test/CodeGen/X86/div_i129_v_pow2k.ll
@@ -49,40 +49,35 @@ define i129 @v_sdiv_i129_v_pow2k(i129 %lhs) nounwind {
 ;
 ; X86-LABEL: v_sdiv_i129_v_pow2k:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl $1, %ebx
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl $1, %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    shldl $31, %edx, %ecx
-; X86-NEXT:    shldl $31, %esi, %edx
-; X86-NEXT:    shldl $31, %edi, %esi
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movb %bl, 16(%eax)
+; X86-NEXT:    shrdl $1, %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    shrdl $1, %edx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    shrdl $1, %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movb %cl, 16(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-O0-LABEL: v_sdiv_i129_v_pow2k:
@@ -162,8 +157,6 @@ define i129 @v_sdiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
 ;
 ; X86-LABEL: v_sdiv_exact_i129_v_pow2k:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -172,21 +165,19 @@ define i129 @v_sdiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    negl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    shrdl $1, %ebx, %edi
-; X86-NEXT:    shldl $31, %ebp, %edx
-; X86-NEXT:    shldl $31, %ebx, %ebp
 ; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    shldl $31, %edi, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    shldl $31, %esi, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrdl $1, %esi, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movb %cl, 16(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-O0-LABEL: v_sdiv_exact_i129_v_pow2k:
@@ -248,23 +239,21 @@ define i129 @v_udiv_i129_v_pow2k(i129 %lhs) nounwind {
 ;
 ; X86-LABEL: v_udiv_i129_v_pow2k:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $1, %esi, %edx
-; X86-NEXT:    shldl $31, %edi, %ecx
-; X86-NEXT:    shldl $31, %esi, %edi
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    shldl $31, %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    shldl $31, %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrdl $1, %ecx, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movb $0, 16(%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; X86-O0-LABEL: v_udiv_i129_v_pow2k:
@@ -319,23 +308,21 @@ define i129 @v_udiv_exact_i129_v_pow2k(i129 %lhs) nounwind {
 ;
 ; X86-LABEL: v_udiv_exact_i129_v_pow2k:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $1, %esi, %edx
-; X86-NEXT:    shldl $31, %edi, %ecx
-; X86-NEXT:    shldl $31, %esi, %edi
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    shldl $31, %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    shldl $31, %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrdl $1, %ecx, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movb $0, 16(%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; X86-O0-LABEL: v_udiv_exact_i129_v_pow2k:
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index e0bff50e2e2dd..b2de249243b6a 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -297,10 +297,12 @@ define i64 @PR23590(i64 %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $12345 # imm = 0x3039
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __umoddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, %esi
@@ -612,8 +614,8 @@ define i64 @urem_i64_255(i64 %x) nounwind {
 ; X86-LABEL: urem_i64_255:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl $0, %eax
@@ -685,8 +687,8 @@ define i64 @urem_i64_65535(i64 %x) nounwind {
 ; X86-LABEL: urem_i64_65535:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl $0, %eax
@@ -1195,10 +1197,12 @@ define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize {
 ; X86-LABEL: urem_i64_3_optsize:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __umoddi3
 ; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/divrem.ll b/llvm/test/CodeGen/X86/divrem.ll
index ba777b4954611..79d2750e0c55f 100644
--- a/llvm/test/CodeGen/X86/divrem.ll
+++ b/llvm/test/CodeGen/X86/divrem.ll
@@ -9,28 +9,27 @@ define void @si64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __moddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    calll __divdi3
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, (%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edx, 4(%ecx)
 ; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    calll __moddi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %edx, 4(%esi)
+; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -56,15 +55,13 @@ define void @si64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind {
 define void @si32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: si32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: si32:
@@ -86,15 +83,13 @@ define void @si32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
 define void @si16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: si16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cwtd
 ; X86-NEXT:    idivw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    movw %dx, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: si16:
@@ -117,15 +112,13 @@ define void @si16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
 define void @si8(i8 %x, i8 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: si8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    idivb {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movsbl %ah, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movsbl %ah, %ebx
 ; X86-NEXT:    movb %al, (%edx)
-; X86-NEXT:    movb %bl, (%ecx)
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: si8:
@@ -150,28 +143,27 @@ define void @ui64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __umoddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, (%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edx, 4(%ecx)
 ; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    calll __umoddi3
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %edx, 4(%esi)
+; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -197,15 +189,13 @@ define void @ui64(i64 %x, i64 %y, ptr %p, ptr %q) nounwind {
 define void @ui32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: ui32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ui32:
@@ -227,15 +217,13 @@ define void @ui32(i32 %x, i32 %y, ptr %p, ptr %q) nounwind {
 define void @ui16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: ui16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divw {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    movw %dx, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %dx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ui16:
@@ -258,15 +246,13 @@ define void @ui16(i16 %x, i16 %y, ptr %p, ptr %q) nounwind {
 define void @ui8(i8 %x, i8 %y, ptr %p, ptr %q) nounwind {
 ; X86-LABEL: ui8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    divb {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl %ah, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl %ah, %ebx
 ; X86-NEXT:    movb %al, (%edx)
-; X86-NEXT:    movb %bl, (%ecx)
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ui8:
diff --git a/llvm/test/CodeGen/X86/divrem8_ext.ll b/llvm/test/CodeGen/X86/divrem8_ext.ll
index bfc982d2def5d..3480384a209be 100644
--- a/llvm/test/CodeGen/X86/divrem8_ext.ll
+++ b/llvm/test/CodeGen/X86/divrem8_ext.ll
@@ -49,8 +49,8 @@ define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
 define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
 ; X86-LABEL: test_urem_noext_ah:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    divb %cl
 ; X86-NEXT:    movzbl %ah, %eax
 ; X86-NEXT:    addb %cl, %al
@@ -137,8 +137,8 @@ define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
 define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
 ; X86-LABEL: test_srem_noext_ah:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    idivb %cl
 ; X86-NEXT:    movsbl %ah, %eax
 ; X86-NEXT:    addb %cl, %al
diff --git a/llvm/test/CodeGen/X86/dollar-name.ll b/llvm/test/CodeGen/X86/dollar-name.ll
index b997b5107f01a..7d77d07c4b310 100644
--- a/llvm/test/CodeGen/X86/dollar-name.ll
+++ b/llvm/test/CodeGen/X86/dollar-name.ll
@@ -12,8 +12,8 @@ define i32 @"$foo"() nounwind {
 ; STATIC-LABEL: $foo:
 ; STATIC:       # %bb.0:
 ; STATIC-NEXT:    movl ($arr), %eax
-; STATIC-NEXT:    movl %gs:0, %ecx
 ; STATIC-NEXT:    addl ($arr+4), %eax
+; STATIC-NEXT:    movl %gs:0, %ecx
 ; STATIC-NEXT:    addl ($tls@NTPOFF)(%ecx), %eax
 ; STATIC-NEXT:    pushl ($arr_h)
 ; STATIC-NEXT:    pushl %eax
diff --git a/llvm/test/CodeGen/X86/emutls-pic.ll b/llvm/test/CodeGen/X86/emutls-pic.ll
index fc57d9919ed05..464b61b0f09b4 100644
--- a/llvm/test/CodeGen/X86/emutls-pic.ll
+++ b/llvm/test/CodeGen/X86/emutls-pic.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -emulated-tls -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X86 %s
 ; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
 ; RUN: llc < %s -emulated-tls -mtriple=i386-linux-android -relocation-model=pic | FileCheck -check-prefix=X86 %s
@@ -19,13 +20,39 @@ declare ptr @my_emutls_get_address(ptr)
 
 define i32 @my_get_xyz() {
 ; X86-LABEL: my_get_xyz:
-; X86:      movl my_emutls_v_xyz@GOT(%ebx), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: calll my_emutls_get_address@PLT
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    calll .L0$pb
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:  .L0$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:  .Ltmp0:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ebx
+; X86-NEXT:    movl my_emutls_v_xyz@GOT(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll my_emutls_get_address@PLT
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
 ; X64-LABEL: my_get_xyz:
-; X64:      movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi
-; X64-NEXT: callq my_emutls_get_address@PLT
-; X64-NEXT: movl (%rax), %eax
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi
+; X64-NEXT:    callq my_emutls_get_address@PLT
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
 
 entry:
   %call = call ptr @my_emutls_get_address(ptr @my_emutls_v_xyz)
@@ -38,6 +65,40 @@ entry:
 @k = internal thread_local global i32 0, align 8
 
 define i32 @f1() {
+; X86-LABEL: f1:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    calll .L1$pb
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:  .L1$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:  .Ltmp1:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %ebx
+; X86-NEXT:    movl __emutls_v.i@GOT(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f1:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
 entry:
   %tmp1 = load i32, ptr @i
   ret i32 %tmp1
@@ -55,6 +116,38 @@ entry:
 @i2 = external thread_local global i32
 
 define ptr @f2() {
+; X86-LABEL: f2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    calll .L2$pb
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:  .L2$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:  .Ltmp2:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.L2$pb), %ebx
+; X86-NEXT:    movl __emutls_v.i@GOT(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
 entry:
   ret ptr @i
 }
@@ -64,6 +157,40 @@ entry:
 
 
 define i32 @f3() {
+; X86-LABEL: f3:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    calll .L3$pb
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:  .L3$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:  .Ltmp3:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp3-.L3$pb), %ebx
+; X86-NEXT:    movl __emutls_v.i@GOT(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-LABEL: f3:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    retq
 entry:
   %tmp1 = load i32, ptr @i  ; <i32> [#uses=1]
   ret i32 %tmp1
@@ -74,6 +201,29 @@ entry:
 
 
 define ptr @f4() nounwind {
+; X86-LABEL: f4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    calll .L4$pb
+; X86-NEXT:  .L4$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp4:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.L4$pb), %ebx
+; X86-NEXT:    movl __emutls_v.i@GOT(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: f4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
 entry:
   ret ptr @i
 }
@@ -83,6 +233,42 @@ entry:
 
 
 define i32 @f5() nounwind {
+; X86-LABEL: f5:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll .L5$pb
+; X86-NEXT:  .L5$pb:
+; X86-NEXT:    popl %ebx
+; X86-NEXT:  .Ltmp5:
+; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.L5$pb), %ebx
+; X86-NEXT:    leal __emutls_v.j@GOTOFF(%ebx), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    leal __emutls_v.k@GOTOFF(%ebx), %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    calll __emutls_get_address@PLT
+; X86-NEXT:    addl (%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: f5:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    leaq __emutls_v.j(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    movl (%rax), %ebx
+; X64-NEXT:    leaq __emutls_v.k(%rip), %rdi
+; X64-NEXT:    callq __emutls_get_address@PLT
+; X64-NEXT:    addl (%rax), %ebx
+; X64-NEXT:    movl %ebx, %eax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
 entry:
   %0 = load i32, ptr @j, align 4
   %1 = load i32, ptr @k, align 4
@@ -181,3 +367,5 @@ entry:
 ; X64-NEXT: .quad 0
 
 ; X64-NOT:   __emutls_t.k:
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NoEMU: {{.*}}
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 2609b06361af5..e1b6c3f62a6cc 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -457,14 +457,14 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
 ; X86-NEXT:    subl $40, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 44
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vmovhps %xmm0, (%esp)
 ; X86-NEXT:    calll __truncdfhf2
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovhps %xmm0, (%esp)
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovq %xmm0, (%esp)
 ; X86-NEXT:    calll __truncdfhf2
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; X86-NEXT:    addl $40, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll
index 018b6c2d20f1e..cb6fac39d18aa 100644
--- a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll
@@ -8,8 +8,8 @@
 define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fadd_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -33,8 +33,8 @@ declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
 define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fsub_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vsubps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -58,8 +58,8 @@ declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
 define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fmul_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -83,8 +83,8 @@ declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
 define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fdiv_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vdivps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -108,11 +108,9 @@ declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
 define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_frem_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vextractps $2, %xmm0, (%esp)
 ; X86-NEXT:    calll fmodf
@@ -143,9 +141,9 @@ define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp)
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-NEXT:    vmovaps %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    addl $80, %esp
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_frem_v4f32:
@@ -230,8 +228,8 @@ declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
 define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fabs_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -268,8 +266,8 @@ declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32)
 define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_sqrt_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vsqrtps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -293,8 +291,8 @@ declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32)
 define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_fneg_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -331,11 +329,9 @@ declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32)
 define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
 ; X86-LABEL: vp_fma_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $84, %esp
 ; X86-NEXT:    vmovupd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    vextractps $2, %xmm0, (%esp)
 ; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
 ; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
@@ -373,9 +369,9 @@ define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) no
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X86-NEXT:    vmovaps %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    addl $84, %esp
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_fma_v4f32:
@@ -515,9 +511,9 @@ declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4
 define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
 ; X86-LABEL: vp_fmuladd_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index dbfa69d497698..c41e130877769 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -8,8 +8,8 @@
 define void @vp_add_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_add_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -33,8 +33,8 @@ declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_sub_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_sub_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -58,8 +58,8 @@ declare <4 x i32> @llvm.vp.sub.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_mul_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_mul_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -89,23 +89,21 @@ declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_sdiv_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
-; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
-; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
-; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; X86-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3]
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm3
+; X86-NEXT:    vpmaxud %xmm2, %xmm3, %xmm3
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vextractps $1, %xmm1, %ecx
 ; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    vmovd %xmm1, %edi
+; X86-NEXT:    vmovd %xmm1, %esi
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
+; X86-NEXT:    idivl %esi
 ; X86-NEXT:    vmovd %eax, %xmm2
 ; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X86-NEXT:    vpextrd $2, %xmm1, %ecx
@@ -118,9 +116,9 @@ define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-NEXT:    vmovdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_sdiv_v4i32:
@@ -268,23 +266,21 @@ declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_udiv_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
-; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
-; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
-; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; X86-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3]
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm3
+; X86-NEXT:    vpmaxud %xmm2, %xmm3, %xmm3
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vextractps $1, %xmm1, %ecx
 ; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    vmovd %xmm1, %edi
+; X86-NEXT:    vmovd %xmm1, %esi
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
+; X86-NEXT:    divl %esi
 ; X86-NEXT:    vmovd %eax, %xmm2
 ; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X86-NEXT:    vpextrd $2, %xmm1, %ecx
@@ -297,9 +293,9 @@ define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-NEXT:    vmovdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_udiv_v4i32:
@@ -447,23 +443,21 @@ declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_srem_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
-; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
-; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
-; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; X86-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3]
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm3
+; X86-NEXT:    vpmaxud %xmm2, %xmm3, %xmm3
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vextractps $1, %xmm1, %ecx
 ; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    vmovd %xmm1, %edi
+; X86-NEXT:    vmovd %xmm1, %esi
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
+; X86-NEXT:    idivl %esi
 ; X86-NEXT:    vmovd %edx, %xmm2
 ; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X86-NEXT:    vpextrd $2, %xmm1, %ecx
@@ -476,9 +470,9 @@ define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; X86-NEXT:    vmovdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_srem_v4i32:
@@ -626,23 +620,21 @@ declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_urem_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
-; X86-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [0,1,2,3]
-; X86-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
-; X86-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
+; X86-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,1,2,3]
+; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm3
+; X86-NEXT:    vpmaxud %xmm2, %xmm3, %xmm3
+; X86-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm2
 ; X86-NEXT:    vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vextractps $1, %xmm1, %ecx
 ; X86-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    vmovd %xmm1, %edi
+; X86-NEXT:    vmovd %xmm1, %esi
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
+; X86-NEXT:    divl %esi
 ; X86-NEXT:    vmovd %edx, %xmm2
 ; X86-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X86-NEXT:    vpextrd $2, %xmm1, %ecx
@@ -655,9 +647,9 @@ define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; X86-NEXT:    vmovdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: vp_urem_v4i32:
@@ -805,7 +797,6 @@ declare <4 x i32> @llvm.vp.urem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_ashr_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
@@ -818,6 +809,7 @@ define void @vp_ashr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -878,7 +870,6 @@ declare <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_lshr_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
 ; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
@@ -891,6 +882,7 @@ define void @vp_lshr_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) noun
 ; X86-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -951,11 +943,11 @@ declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_shl_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_shl_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpslld $23, %xmm1, %xmm1
 ; X86-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vcvttps2dq %xmm1, %xmm1
 ; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1003,8 +995,8 @@ declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_or_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_or_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1028,8 +1020,8 @@ declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_and_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_and_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1053,8 +1045,8 @@ declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_xor_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_xor_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1078,8 +1070,8 @@ declare <4 x i32> @llvm.vp.xor.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_abs_v4i32(<4 x i32> %a0, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_abs_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpabsd %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1106,8 +1098,8 @@ declare <4 x i32> @llvm.vp.abs.v4i32(<4 x i32>, i1 immarg, <4 x i1>, i32)
 define void @vp_smax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_smax_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1135,8 +1127,8 @@ declare <4 x i32> @llvm.vp.smax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_smin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_smin_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1164,8 +1156,8 @@ declare <4 x i32> @llvm.vp.smin.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_umax_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_umax_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1196,8 +1188,8 @@ declare <4 x i32> @llvm.vp.umax.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
 define void @vp_umin_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
 ; X86-LABEL: vp_umin_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a..0614c19ff4400 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -33,17 +33,14 @@ declare void @use64(i64)
 define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a0:
@@ -98,27 +95,24 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a0_arithmetic:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    sarl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a0_arithmetic:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    sarl %cl, %edx
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    sarl %cl, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_a0_arithmetic:
@@ -165,17 +159,14 @@ define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) n
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a1_indexzext:
@@ -231,36 +222,33 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_a2_load:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -301,36 +289,33 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_a3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -372,17 +357,14 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a4_commutative:
@@ -437,38 +419,35 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, (%esp)
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    decl %esi
-; X86-NOBMI-NEXT:    andl %edi, %esi
-; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_a5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl %al, %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    movzbl %al, %eax
+; X86-BMI1-NEXT:    orl %ecx, %eax
+; X86-BMI1-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    addl $8, %esp
@@ -479,11 +458,11 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %esi
-; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    addl $8, %esp
@@ -542,25 +521,27 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB7_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB7_2:
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB7_4
@@ -574,29 +555,32 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a0:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB7_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB7_2:
-; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB7_4
@@ -610,6 +594,7 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a0:
@@ -618,21 +603,21 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB7_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB7_2:
-; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB7_4
@@ -684,26 +669,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a0_arithmetic:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %esi
 ; X86-NOBMI-NEXT:    sarl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB8_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    sarl $31, %eax
+; X86-NOBMI-NEXT:    sarl $31, %ebx
 ; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %ebx, %esi
 ; X86-NOBMI-NEXT:  .LBB8_2:
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB8_4
@@ -717,30 +704,33 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 ; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a0_arithmetic:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %esi
 ; X86-BMI1-NEXT:    sarl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB8_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    sarl $31, %eax
+; X86-BMI1-NEXT:    sarl $31, %ebx
 ; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %ebx, %esi
 ; X86-BMI1-NEXT:  .LBB8_2:
-; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB8_4
@@ -754,30 +744,32 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 ; X86-BMI1-NEXT:    andl %esi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a0_arithmetic:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    sarxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI2-NEXT:    shrdl %cl, %ebp, %esi
+; X86-BMI2-NEXT:    sarxl %ecx, %ebp, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB8_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    sarl $31, %eax
+; X86-BMI2-NEXT:    sarl $31, %ebp
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl %eax, %edi
+; X86-BMI2-NEXT:    movl %ebp, %edi
 ; X86-BMI2-NEXT:  .LBB8_2:
-; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB8_4
@@ -792,6 +784,7 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_a0_arithmetic:
@@ -830,25 +823,27 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB9_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB9_2:
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB9_4
@@ -862,29 +857,32 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a1_indexzext:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB9_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB9_2:
-; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB9_4
@@ -898,6 +896,7 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a1_indexzext:
@@ -906,21 +905,21 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB9_2
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB9_2:
-; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB9_4
@@ -976,26 +975,28 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl (%edi), %esi
+; X86-NOBMI-NEXT:    movl 4(%edi), %ebx
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %ebx, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB10_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB10_2:
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB10_4
@@ -1009,30 +1010,33 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a2_load:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl (%edi), %esi
+; X86-BMI1-NEXT:    movl 4(%edi), %ebx
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %esi
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %ebx, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB10_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB10_2:
-; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB10_4
@@ -1046,30 +1050,32 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a2_load:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %esi
-; X86-BMI2-NEXT:    movl 4(%eax), %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl (%esi), %edi
+; X86-BMI2-NEXT:    movl 4(%esi), %ebp
+; X86-BMI2-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-BMI2-NEXT:    shrdl %cl, %ebp, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB10_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl %esi, %edi
+; X86-BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI2-NEXT:  .LBB10_2:
-; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB10_4
@@ -1079,11 +1085,12 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI2-NEXT:  .LBB10_4:
 ; X86-BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI2-NEXT:    adcl $-1, %edx
-; X86-BMI2-NEXT:    andl %esi, %eax
-; X86-BMI2-NEXT:    andl %edi, %edx
+; X86-BMI2-NEXT:    andl %edi, %eax
+; X86-BMI2-NEXT:    andl %esi, %edx
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_a2_load:
@@ -1123,26 +1130,28 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl (%edi), %esi
+; X86-NOBMI-NEXT:    movl 4(%edi), %ebx
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %ebx, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB11_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB11_2:
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB11_4
@@ -1156,30 +1165,33 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl (%edi), %esi
+; X86-BMI1-NEXT:    movl 4(%edi), %ebx
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %esi
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %ebx, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB11_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB11_2:
-; X86-BMI1-NEXT:    movl $1, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB11_4
@@ -1193,30 +1205,32 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %esi
-; X86-BMI2-NEXT:    movl 4(%eax), %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl (%esi), %edi
+; X86-BMI2-NEXT:    movl 4(%esi), %ebp
+; X86-BMI2-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-BMI2-NEXT:    shrdl %cl, %ebp, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB11_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl %esi, %edi
+; X86-BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI2-NEXT:  .LBB11_2:
-; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB11_4
@@ -1226,11 +1240,12 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-BMI2-NEXT:  .LBB11_4:
 ; X86-BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI2-NEXT:    adcl $-1, %edx
-; X86-BMI2-NEXT:    andl %esi, %eax
-; X86-BMI2-NEXT:    andl %edi, %edx
+; X86-BMI2-NEXT:    andl %edi, %eax
+; X86-BMI2-NEXT:    andl %esi, %edx
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_a3_load_indexzext:
@@ -1274,25 +1289,27 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_a4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB12_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:  .LBB12_2:
-; X86-NOBMI-NEXT:    movl $1, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB12_4
@@ -1306,29 +1323,32 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_a4_commutative:
 ; X86-BMI1:       # %bb.0:
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
 ; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB12_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB12_2:
-; X86-BMI1-NEXT:    movl $1, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:    movb %ch, %cl
-; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB12_4
@@ -1342,6 +1362,7 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
+; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_a4_commutative:
@@ -1350,6 +1371,10 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl $1, %edi
+; X86-BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1361,10 +1386,6 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:  .LBB12_2:
-; X86-BMI2-NEXT:    movl $1, %edi
-; X86-BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI2-NEXT:    movl %ebx, %ecx
-; X86-BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI2-NEXT:    shlxl %ebx, %edi, %ecx
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB12_4
@@ -1421,26 +1442,30 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movl %edx, (%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movl $1, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    movl %eax, %ebp
+; X86-NOBMI-NEXT:    movb %dl, %cl
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NOBMI-NEXT:    testb $32, %al
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NOBMI-NEXT:    testb $32, %dl
 ; X86-NOBMI-NEXT:    je .LBB13_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
 ; X86-NOBMI-NEXT:  .LBB13_2:
-; X86-NOBMI-NEXT:    movl $1, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
+; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB13_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
@@ -1450,11 +1475,7 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    adcl $-1, %edi
 ; X86-NOBMI-NEXT:    andl %ebx, %esi
 ; X86-NOBMI-NEXT:    andl %ebp, %edi
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    addl $12, %esp
@@ -1471,26 +1492,30 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movl %edx, (%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1-NEXT:    movl $1, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
+; X86-BMI1-NEXT:    movb %ch, %cl
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %esi, %ebp
-; X86-BMI1-NEXT:    movl %eax, %ecx
+; X86-BMI1-NEXT:    movl %eax, %ebp
+; X86-BMI1-NEXT:    movb %dl, %cl
 ; X86-BMI1-NEXT:    shrl %cl, %ebp
-; X86-BMI1-NEXT:    shrdl %cl, %esi, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-BMI1-NEXT:    testb $32, %dl
 ; X86-BMI1-NEXT:    je .LBB13_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %ebp, %ebx
 ; X86-BMI1-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1-NEXT:  .LBB13_2:
-; X86-BMI1-NEXT:    movl $1, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:    movl %edx, %ecx
-; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI1-NEXT:    movb %ch, %cl
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB13_4
 ; X86-BMI1-NEXT:  # %bb.3:
 ; X86-BMI1-NEXT:    movl %esi, %edi
@@ -1500,11 +1525,7 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1-NEXT:    adcl $-1, %edi
 ; X86-BMI1-NEXT:    andl %ebx, %esi
 ; X86-BMI1-NEXT:    andl %ebp, %edi
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI1-NEXT:    pushl %eax
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
 ; X86-BMI1-NEXT:    addl $12, %esp
@@ -1521,25 +1542,28 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %edx, (%esp)
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
-; X86-BMI2-NEXT:    shrxl %eax, %esi, %ebp
-; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB13_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %ebp, %ebx
-; X86-BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI2-NEXT:  .LBB13_2:
+; X86-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl $1, %edi
 ; X86-BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI2-NEXT:    movl %edx, %ecx
+; X86-BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI2-NEXT:    shldl %cl, %edi, %esi
-; X86-BMI2-NEXT:    shlxl %edx, %edi, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI2-NEXT:    movl %edx, %ecx
+; X86-BMI2-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-BMI2-NEXT:    shrxl %edx, %ebp, %ecx
 ; X86-BMI2-NEXT:    testb $32, %dl
+; X86-BMI2-NEXT:    je .LBB13_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %ecx, %ebx
+; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:  .LBB13_2:
+; X86-BMI2-NEXT:    shlxl %eax, %edi, %edi
+; X86-BMI2-NEXT:    testb $32, %al
 ; X86-BMI2-NEXT:    je .LBB13_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %edi, %esi
@@ -1548,12 +1572,8 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI2-NEXT:    addl $-1, %edi
 ; X86-BMI2-NEXT:    adcl $-1, %esi
 ; X86-BMI2-NEXT:    andl %ebx, %edi
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI2-NEXT:    pushl %eax
+; X86-BMI2-NEXT:    andl %ecx, %esi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    movl %edi, %eax
 ; X86-BMI2-NEXT:    movl %esi, %edx
 ; X86-BMI2-NEXT:    addl $12, %esp
@@ -1616,70 +1636,62 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_a0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB14_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB14_2:
-; X86-NOBMI-NEXT:    movl $1, %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB14_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB14_4:
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_a0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB14_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB14_2:
-; X86-BMI1-NEXT:    movl $1, %edi
-; X86-BMI1-NEXT:    movl %edx, %ecx
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $1, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    xorl %eax, %eax
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB14_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB14_4:
 ; X86-BMI1-NEXT:    decl %eax
-; X86-BMI1-NEXT:    andl %esi, %eax
+; X86-BMI1-NEXT:    andl %edx, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_a0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1689,16 +1701,16 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edx
 ; X86-BMI2-NEXT:  .LBB14_2:
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB14_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:  .LBB14_4:
 ; X86-BMI2-NEXT:    decl %eax
 ; X86-BMI2-NEXT:    andl %edx, %eax
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_a0:
@@ -1740,66 +1752,59 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_a1:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB15_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB15_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_a1:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB15_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB15_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_a1:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB15_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB15_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_a1:
@@ -1841,80 +1846,76 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_a1_trunc_extrause:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl %edx, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB16_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:  .LBB16_2:
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
+; X86-NOBMI-NEXT:    decl %esi
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %ebx, %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
-; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_a1_trunc_extrause:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl %edx, %esi
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB16_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB16_2:
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    shll $8, %ebx
-; X86-BMI1-NEXT:    bextrl %ebx, %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_a1_trunc_extrause:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB16_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB16_2:
-; X86-BMI2-NEXT:    movl %esi, (%esp)
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_a1_trunc_extrause:
@@ -1986,66 +1987,59 @@ define i32 @bextr64_32_a1_trunc_extrause(i64 %val, i64 %numskipbits, i32 %numlow
 define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_a2:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB17_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB17_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_a2:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB17_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB17_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_a2:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB17_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB17_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_a2:
@@ -2088,70 +2082,62 @@ define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_a3:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB18_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB18_2:
-; X86-NOBMI-NEXT:    movl $1, %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $1, %esi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB18_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB18_4:
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_a3:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB18_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB18_2:
-; X86-BMI1-NEXT:    movl $1, %edi
-; X86-BMI1-NEXT:    movl %edx, %ecx
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $1, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    xorl %eax, %eax
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB18_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB18_4:
 ; X86-BMI1-NEXT:    decl %eax
-; X86-BMI1-NEXT:    andl %esi, %eax
+; X86-BMI1-NEXT:    andl %edx, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_a3:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -2161,16 +2147,16 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edx
 ; X86-BMI2-NEXT:  .LBB18_2:
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB18_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl $1, %eax
-; X86-BMI2-NEXT:    shlxl %ebx, %eax, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:  .LBB18_4:
 ; X86-BMI2-NEXT:    decl %eax
 ; X86-BMI2-NEXT:    andl %edx, %eax
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_a3:
@@ -2215,17 +2201,14 @@ define i32 @bextr64_32_a3(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b0:
@@ -2280,17 +2263,14 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b1_indexzext:
@@ -2346,36 +2326,33 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_b2_load:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -2416,36 +2393,33 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_b3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -2487,17 +2461,14 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b4_commutative:
@@ -2552,38 +2523,35 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, (%esp)
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    notl %esi
-; X86-NOBMI-NEXT:    andl %edi, %esi
-; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_b5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl %al, %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    movzbl %al, %eax
+; X86-BMI1-NEXT:    orl %ecx, %eax
+; X86-BMI1-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    addl $8, %esp
@@ -2594,11 +2562,11 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %esi
-; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    addl $8, %esp
@@ -2660,36 +2628,32 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB25_2:
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB25_3
-; X86-NOBMI-NEXT:  # %bb.4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    jmp .LBB25_5
-; X86-NOBMI-NEXT:  .LBB25_3:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:  .LBB25_5:
-; X86-NOBMI-NEXT:    notl %edx
-; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB25_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:  .LBB25_4:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -2700,31 +2664,30 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB25_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %esi
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB25_2:
-; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB25_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edx, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB25_4:
-; X86-BMI1-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1-NEXT:    andnl %edx, %eax, %edx
+; X86-BMI1-NEXT:    andnl %edi, %esi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
@@ -2732,32 +2695,32 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: bextr64_b0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB25_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB25_2:
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB25_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ecx, %esi
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB25_4:
-; X86-BMI2-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI2-NEXT:    andnl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    andnl %edi, %edx, %edx
+; X86-BMI2-NEXT:    andnl %esi, %eax, %eax
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_b0:
@@ -2798,36 +2761,32 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB26_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB26_2:
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB26_3
-; X86-NOBMI-NEXT:  # %bb.4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    jmp .LBB26_5
-; X86-NOBMI-NEXT:  .LBB26_3:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:  .LBB26_5:
-; X86-NOBMI-NEXT:    notl %edx
-; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB26_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:  .LBB26_4:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -2838,31 +2797,30 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB26_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %esi
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB26_2:
-; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB26_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edx, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB26_4:
-; X86-BMI1-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1-NEXT:    andnl %edx, %eax, %edx
+; X86-BMI1-NEXT:    andnl %edi, %esi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
@@ -2870,32 +2828,32 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X86-BMI2-LABEL: bextr64_b1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB26_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB26_2:
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB26_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ecx, %esi
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB26_4:
-; X86-BMI2-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI2-NEXT:    andnl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    andnl %edi, %edx, %edx
+; X86-BMI2-NEXT:    andnl %esi, %eax, %eax
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_b1_indexzext:
@@ -2940,37 +2898,33 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB27_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB27_2:
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB27_3
-; X86-NOBMI-NEXT:  # %bb.4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    jmp .LBB27_5
-; X86-NOBMI-NEXT:  .LBB27_3:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:  .LBB27_5:
-; X86-NOBMI-NEXT:    notl %edx
-; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %edi
+; X86-NOBMI-NEXT:    movl 4(%ecx), %ebx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ebx, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB27_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:  .LBB27_4:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -2978,68 +2932,67 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ;
 ; X86-BMI1-LABEL: bextr64_b2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl (%ecx), %eax
+; X86-BMI1-NEXT:    movl 4(%ecx), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl (%edx), %esi
-; X86-BMI1-NEXT:    movl 4(%edx), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
+; X86-BMI1-NEXT:    movl %esi, %edx
 ; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB27_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB27_2:
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB27_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB27_4:
-; X86-BMI1-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1-NEXT:    andnl %eax, %edi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_b2_load:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl (%edx), %eax
-; X86-BMI2-NEXT:    movl 4(%edx), %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB27_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB27_2:
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    movl (%edi), %esi
+; X86-BMI2-NEXT:    movl 4(%edi), %ebx
+; X86-BMI2-NEXT:    shrxl %ecx, %ebx, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB27_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ecx, %esi
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB27_4:
-; X86-BMI2-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI2-NEXT:    andnl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    andnl %edi, %edx, %edx
+; X86-BMI2-NEXT:    andnl %esi, %eax, %eax
 ; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
@@ -3083,37 +3036,33 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB28_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB28_2:
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB28_3
-; X86-NOBMI-NEXT:  # %bb.4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    jmp .LBB28_5
-; X86-NOBMI-NEXT:  .LBB28_3:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:  .LBB28_5:
-; X86-NOBMI-NEXT:    notl %edx
-; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %edi
+; X86-NOBMI-NEXT:    movl 4(%ecx), %ebx
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ebx, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB28_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:  .LBB28_4:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -3121,68 +3070,67 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ;
 ; X86-BMI1-LABEL: bextr64_b3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl (%ecx), %eax
+; X86-BMI1-NEXT:    movl 4(%ecx), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl (%edx), %esi
-; X86-BMI1-NEXT:    movl 4(%edx), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
+; X86-BMI1-NEXT:    movl %esi, %edx
 ; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB28_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB28_2:
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB28_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB28_4:
-; X86-BMI1-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1-NEXT:    andnl %edx, %esi, %edx
+; X86-BMI1-NEXT:    andnl %eax, %edi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_b3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl (%edx), %eax
-; X86-BMI2-NEXT:    movl 4(%edx), %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB28_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB28_2:
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    movl (%edi), %esi
+; X86-BMI2-NEXT:    movl 4(%edi), %ebx
+; X86-BMI2-NEXT:    shrxl %ecx, %ebx, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB28_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ecx, %esi
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB28_4:
-; X86-BMI2-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI2-NEXT:    andnl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    andnl %edi, %edx, %edx
+; X86-BMI2-NEXT:    andnl %esi, %eax, %eax
 ; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
@@ -3230,36 +3178,32 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB29_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
 ; X86-NOBMI-NEXT:  .LBB29_2:
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB29_3
-; X86-NOBMI-NEXT:  # %bb.4:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NOBMI-NEXT:    movl %ebx, %esi
-; X86-NOBMI-NEXT:    jmp .LBB29_5
-; X86-NOBMI-NEXT:  .LBB29_3:
-; X86-NOBMI-NEXT:    movl %ebx, %edi
-; X86-NOBMI-NEXT:  .LBB29_5:
-; X86-NOBMI-NEXT:    notl %edi
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    notl %esi
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB29_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:  .LBB29_4:
+; X86-NOBMI-NEXT:    notl %eax
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -3270,31 +3214,30 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB29_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %esi
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB29_2:
-; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB29_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edx, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB29_4:
-; X86-BMI1-NEXT:    andnl %edx, %edi, %edx
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %eax
+; X86-BMI1-NEXT:    andnl %edx, %eax, %edx
+; X86-BMI1-NEXT:    andnl %edi, %esi, %eax
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
@@ -3302,32 +3245,32 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ;
 ; X86-BMI2-LABEL: bextr64_b4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
+; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB29_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB29_2:
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB29_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ecx, %esi
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB29_4:
-; X86-BMI2-NEXT:    andnl %edx, %esi, %edx
-; X86-BMI2-NEXT:    andnl %eax, %ecx, %eax
+; X86-BMI2-NEXT:    andnl %edi, %edx, %edx
+; X86-BMI2-NEXT:    andnl %esi, %eax, %eax
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_b4_commutative:
@@ -3370,44 +3313,39 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:    movb %al, %cl
-; X86-NOBMI-NEXT:    shrl %cl, %ebp
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:    testb $32, %al
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB30_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebp, %edx
-; X86-NOBMI-NEXT:    xorl %ebp, %ebp
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB30_2:
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movb %ch, %cl
-; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    jne .LBB30_3
-; X86-NOBMI-NEXT:  # %bb.4:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
-; X86-NOBMI-NEXT:    jmp .LBB30_5
-; X86-NOBMI-NEXT:  .LBB30_3:
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:  .LBB30_5:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT:    movl %ebp, %edx
+; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-NOBMI-NEXT:    testb $32, %al
+; X86-NOBMI-NEXT:    je .LBB30_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %edx, %ebx
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB30_4:
 ; X86-NOBMI-NEXT:    notl %edi
-; X86-NOBMI-NEXT:    andl %ebp, %edi
-; X86-NOBMI-NEXT:    notl %ebx
-; X86-NOBMI-NEXT:    andl %edx, %ebx
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    pushl %eax
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    notl %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    movl %ebx, %eax
-; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
@@ -3417,62 +3355,56 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ;
 ; X86-BMI1-LABEL: bextr64_b5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    subl $20, %esp
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %ebx, %esi
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %ecx, (%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB30_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB30_2:
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl %edx, %ecx
-; X86-BMI1-NEXT:    shll %cl, %ebp
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edi
+; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB30_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebp, %ebx
-; X86-BMI1-NEXT:    xorl %ebp, %ebp
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB30_4:
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %esi
-; X86-BMI1-NEXT:    andnl %edi, %ebp, %edi
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI1-NEXT:    pushl %eax
+; X86-BMI1-NEXT:    andnl %edx, %esi, %esi
+; X86-BMI1-NEXT:    andnl %eax, %edi, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    movl %esi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $20, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_b5_skipextrauses:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    subl $16, %esp
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -3481,29 +3413,24 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:  .LBB30_2:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shlxl %ebx, %esi, %edi
+; X86-BMI2-NEXT:    shlxl %ebx, %esi, %ecx
 ; X86-BMI2-NEXT:    testb $32, %bl
 ; X86-BMI2-NEXT:    je .LBB30_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-BMI2-NEXT:  .LBB30_4:
 ; X86-BMI2-NEXT:    andnl %edx, %esi, %esi
-; X86-BMI2-NEXT:    andnl %eax, %edi, %edi
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebp
-; X86-BMI2-NEXT:    pushl %ecx
+; X86-BMI2-NEXT:    andnl %eax, %ecx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    movl %edi, %eax
 ; X86-BMI2-NEXT:    movl %esi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_b5_skipextrauses:
@@ -3559,87 +3486,79 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_b0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB31_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:  .LBB31_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB31_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ecx
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:  .LBB31_4:
-; X86-NOBMI-NEXT:    notl %ecx
-; X86-NOBMI-NEXT:    andl %ecx, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_b0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB31_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB31_2:
-; X86-BMI1-NEXT:    movl $-1, %esi
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    xorl %ecx, %ecx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shll %cl, %edx
+; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB31_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ecx
+; X86-BMI1-NEXT:    movl %edx, %esi
 ; X86-BMI1-NEXT:  .LBB31_4:
-; X86-BMI1-NEXT:    andnl %edx, %ecx, %eax
+; X86-BMI1-NEXT:    andnl %eax, %esi, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_b0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB31_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB31_2:
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI2-NEXT:    testb $32, %al
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB31_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl $-1, %ecx
-; X86-BMI2-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:  .LBB31_4:
-; X86-BMI2-NEXT:    andnl %edx, %ecx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andnl %eax, %edx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_b0:
@@ -3682,66 +3601,59 @@ define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_b1:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB32_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB32_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_b1:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB32_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB32_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_b1:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB32_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB32_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_b1:
@@ -3784,66 +3696,59 @@ define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_b2:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB33_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB33_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_b2:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB33_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB33_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_b2:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB33_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB33_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_b2:
@@ -3887,87 +3792,79 @@ define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_32_b3:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB34_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:  .LBB34_2:
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    xorl %ecx, %ecx
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB34_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ecx
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:  .LBB34_4:
-; X86-NOBMI-NEXT:    notl %ecx
-; X86-NOBMI-NEXT:    andl %ecx, %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl %edx, %eax
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_32_b3:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB34_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
-; X86-BMI1-NEXT:  .LBB34_2:
-; X86-BMI1-NEXT:    movl $-1, %esi
-; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    xorl %ecx, %ecx
-; X86-BMI1-NEXT:    testb $32, %al
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:  .LBB34_2:
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shll %cl, %edx
+; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB34_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ecx
+; X86-BMI1-NEXT:    movl %edx, %esi
 ; X86-BMI1-NEXT:  .LBB34_4:
-; X86-BMI1-NEXT:    andnl %edx, %ecx, %eax
+; X86-BMI1-NEXT:    andnl %eax, %esi, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_b3:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB34_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB34_2:
-; X86-BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI2-NEXT:    testb $32, %al
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB34_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl $-1, %ecx
-; X86-BMI2-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:  .LBB34_4:
-; X86-BMI2-NEXT:    andnl %edx, %ecx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    andnl %eax, %edx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_b3:
@@ -4014,64 +3911,59 @@ define i32 @bextr64_32_b3(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    andl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    andl %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    andl %edi, %esi
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
-; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c0:
@@ -4144,64 +4036,59 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    andl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    andl %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    andl %edi, %esi
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
-; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c1_indexzext:
@@ -4276,67 +4163,62 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %esi
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    andl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl (%ecx), %esi
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    andl %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    andl %edi, %esi
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    subl $8, %esp
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %esi
-; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
-; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    bzhil %ecx, %edx, %edx
+; X86-BMI2-NEXT:    movl %edx, (%esp)
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c2_load:
@@ -4410,67 +4292,62 @@ define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %esi
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    andl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl (%ecx), %esi
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    andl %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    andl %edi, %esi
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    subl $8, %esp
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%eax), %esi
-; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
-; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    bzhil %ecx, %edx, %edx
+; X86-BMI2-NEXT:    movl %edx, (%esp)
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c3_load_indexzext:
@@ -4546,64 +4423,59 @@ define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    andl %eax, %esi
 ; X86-NOBMI-NEXT:    calll use32@PLT
-; X86-NOBMI-NEXT:    andl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    addl $8, %esp
 ; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c4_commutative:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    pushl %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    subl $8, %esp
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    andl %eax, %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
-; X86-BMI1-NEXT:    andl %edi, %esi
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $4, %esp
+; X86-BMI1-NEXT:    addl $8, %esp
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    pushl %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-BMI2-NEXT:    subl $8, %esp
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
-; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %eax
-; X86-BMI2-NEXT:    addl $4, %esp
+; X86-BMI2-NEXT:    movl %esi, %eax
+; X86-BMI2-NEXT:    addl $8, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c4_commutative:
@@ -4676,78 +4548,72 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $16, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl %ebx, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    movl %edi, (%esp)
 ; X86-NOBMI-NEXT:    calll use32@PLT
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, (%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    andl %edi, %esi
-; X86-NOBMI-NEXT:    movl %ebx, (%esp)
 ; X86-NOBMI-NEXT:    calll use32@PLT
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    addl $4, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $16, %esp
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI1-NEXT:    movl %ebx, %ecx
-; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    pushl %eax
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edi
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    movl %edi, (%esp)
 ; X86-BMI1-NEXT:    calll use32@PLT
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %ecx, (%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-BMI1-NEXT:    shrl %cl, %esi
 ; X86-BMI1-NEXT:    andl %edi, %esi
-; X86-BMI1-NEXT:    movl %ebx, (%esp)
 ; X86-BMI1-NEXT:    calll use32@PLT
 ; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    addl $16, %esp
+; X86-BMI1-NEXT:    addl $4, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $16, %esp
+; X86-BMI2-NEXT:    pushl %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI2-NEXT:    shrxl %edi, {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movl $-1, %eax
 ; X86-BMI2-NEXT:    bzhil %ebx, %eax, %eax
 ; X86-BMI2-NEXT:    movl %eax, (%esp)
 ; X86-BMI2-NEXT:    calll use32@PLT
-; X86-BMI2-NEXT:    bzhil %ebx, %esi, %esi
-; X86-BMI2-NEXT:    movl %edi, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %ebx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
 ; X86-BMI2-NEXT:    movl %esi, %eax
-; X86-BMI2-NEXT:    addl $16, %esp
+; X86-BMI2-NEXT:    addl $4, %esp
 ; X86-BMI2-NEXT:    popl %esi
-; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
@@ -4832,138 +4698,137 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB41_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB41_2:
+; X86-NOBMI-NEXT:    subl $16, %esp
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB41_4
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    jne .LBB41_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:  .LBB41_2:
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    movl $0, %edi
+; X86-NOBMI-NEXT:    jne .LBB41_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:  .LBB41_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB41_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB41_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebp, %esi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB41_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB41_2:
+; X86-BMI1-NEXT:    subl $16, %esp
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB41_4
+; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    jne .LBB41_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:  .LBB41_2:
+; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    movl $0, %edi
+; X86-BMI1-NEXT:    jne .LBB41_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %eax, %edi
 ; X86-BMI1-NEXT:  .LBB41_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebx
-; X86-BMI1-NEXT:    pushl %ebp
+; X86-BMI1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB41_6
+; X86-BMI1-NEXT:  # %bb.5:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB41_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebp, %esi
-; X86-BMI1-NEXT:    andl %ebx, %edi
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    subl $20, %esp
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB41_2
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:    jne .LBB41_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI2-NEXT:  .LBB41_2:
-; X86-BMI2-NEXT:    movb $64, %al
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
-; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB41_4
+; X86-BMI2-NEXT:    movl %esi, (%esp)
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB41_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl %eax, %edi
 ; X86-BMI2-NEXT:  .LBB41_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB41_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB41_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $20, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c0:
@@ -5036,138 +4901,137 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB42_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB42_2:
+; X86-NOBMI-NEXT:    subl $16, %esp
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB42_4
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    jne .LBB42_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:  .LBB42_2:
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    movl $0, %edi
+; X86-NOBMI-NEXT:    jne .LBB42_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:  .LBB42_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB42_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB42_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebp, %esi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB42_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB42_2:
+; X86-BMI1-NEXT:    subl $16, %esp
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB42_4
+; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    jne .LBB42_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:  .LBB42_2:
+; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    movl $0, %edi
+; X86-BMI1-NEXT:    jne .LBB42_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %eax, %edi
 ; X86-BMI1-NEXT:  .LBB42_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebx
-; X86-BMI1-NEXT:    pushl %ebp
+; X86-BMI1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB42_6
+; X86-BMI1-NEXT:  # %bb.5:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB42_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebp, %esi
-; X86-BMI1-NEXT:    andl %ebx, %edi
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    subl $20, %esp
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB42_2
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:    jne .LBB42_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI2-NEXT:  .LBB42_2:
-; X86-BMI2-NEXT:    movb $64, %al
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
-; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB42_4
+; X86-BMI2-NEXT:    movl %esi, (%esp)
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB42_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl %eax, %edi
 ; X86-BMI2-NEXT:  .LBB42_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB42_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB42_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $20, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c1_indexzext:
@@ -5243,141 +5107,142 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB43_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB43_2:
+; X86-NOBMI-NEXT:    subl $16, %esp
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB43_4
+; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    jne .LBB43_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:  .LBB43_2:
+; X86-NOBMI-NEXT:    movl (%edi), %eax
+; X86-NOBMI-NEXT:    movl 4(%edi), %ebx
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    movl $0, %edi
+; X86-NOBMI-NEXT:    jne .LBB43_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %edx, %edi
 ; X86-NOBMI-NEXT:  .LBB43_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB43_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB43_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebp, %esi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %esi
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB43_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB43_2:
+; X86-BMI1-NEXT:    subl $16, %esp
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB43_4
+; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    jne .LBB43_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:  .LBB43_2:
+; X86-BMI1-NEXT:    movl (%edi), %eax
+; X86-BMI1-NEXT:    movl 4(%edi), %ebx
+; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    movl $0, %edi
+; X86-BMI1-NEXT:    jne .LBB43_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edx, %edi
 ; X86-BMI1-NEXT:  .LBB43_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebx
-; X86-BMI1-NEXT:    pushl %ebp
+; X86-BMI1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB43_6
+; X86-BMI1-NEXT:  # %bb.5:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB43_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebp, %esi
-; X86-BMI1-NEXT:    andl %ebx, %edi
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %esi
-; X86-BMI2-NEXT:    movl 4(%eax), %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB43_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:  .LBB43_2:
+; X86-BMI2-NEXT:    subl $16, %esp
 ; X86-BMI2-NEXT:    movb $64, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB43_4
+; X86-BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI2-NEXT:    jne .LBB43_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl $-1, %esi
+; X86-BMI2-NEXT:  .LBB43_2:
+; X86-BMI2-NEXT:    movl (%edx), %eax
+; X86-BMI2-NEXT:    movl 4(%edx), %ebx
+; X86-BMI2-NEXT:    movl %esi, (%esp)
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB43_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl %ecx, %edi
 ; X86-BMI2-NEXT:  .LBB43_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %ecx, %ebx, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB43_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB43_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c2_load:
@@ -5451,141 +5316,142 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB44_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB44_2:
+; X86-NOBMI-NEXT:    subl $16, %esp
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB44_4
+; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    jne .LBB44_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:  .LBB44_2:
+; X86-NOBMI-NEXT:    movl (%edi), %eax
+; X86-NOBMI-NEXT:    movl 4(%edi), %ebx
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    movl $0, %edi
+; X86-NOBMI-NEXT:    jne .LBB44_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %edx, %edi
 ; X86-NOBMI-NEXT:  .LBB44_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB44_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB44_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebp, %esi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %esi
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB44_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB44_2:
+; X86-BMI1-NEXT:    subl $16, %esp
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    movl $-1, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB44_4
+; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    jne .LBB44_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:  .LBB44_2:
+; X86-BMI1-NEXT:    movl (%edi), %eax
+; X86-BMI1-NEXT:    movl 4(%edi), %ebx
+; X86-BMI1-NEXT:    movl %esi, (%esp)
+; X86-BMI1-NEXT:    movl $0, %edi
+; X86-BMI1-NEXT:    jne .LBB44_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edx, %edi
 ; X86-BMI1-NEXT:  .LBB44_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebx
-; X86-BMI1-NEXT:    pushl %ebp
+; X86-BMI1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB44_6
+; X86-BMI1-NEXT:  # %bb.5:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB44_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebp, %esi
-; X86-BMI1-NEXT:    andl %ebx, %edi
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %esi
-; X86-BMI2-NEXT:    movl 4(%eax), %eax
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB44_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:  .LBB44_2:
+; X86-BMI2-NEXT:    subl $16, %esp
 ; X86-BMI2-NEXT:    movb $64, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB44_4
+; X86-BMI2-NEXT:    movl %ecx, %esi
+; X86-BMI2-NEXT:    jne .LBB44_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl $-1, %esi
+; X86-BMI2-NEXT:  .LBB44_2:
+; X86-BMI2-NEXT:    movl (%edx), %eax
+; X86-BMI2-NEXT:    movl 4(%edx), %ebx
+; X86-BMI2-NEXT:    movl %esi, (%esp)
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB44_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl %ecx, %edi
 ; X86-BMI2-NEXT:  .LBB44_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %ecx, %ebx, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB44_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB44_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c3_load_indexzext:
@@ -5662,138 +5528,137 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
+; X86-NOBMI-NEXT:    subl $16, %esp
+; X86-NOBMI-NEXT:    movb $64, %cl
+; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    jne .LBB45_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:  .LBB45_2:
+; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movl $0, %edx
+; X86-NOBMI-NEXT:    jne .LBB45_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %esi, %edx
+; X86-NOBMI-NEXT:  .LBB45_4:
+; X86-NOBMI-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB45_2
-; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    je .LBB45_6
+; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB45_2:
-; X86-NOBMI-NEXT:    movb $64, %cl
-; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB45_4
-; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB45_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebx
-; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:  .LBB45_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebp, %esi
-; X86-NOBMI-NEXT:    andl %ebx, %edi
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c4_commutative:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
+; X86-BMI1-NEXT:    subl $16, %esp
+; X86-BMI1-NEXT:    movb $64, %cl
+; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    shrl %cl, %esi
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    jne .LBB45_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:  .LBB45_2:
+; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movl $0, %edx
+; X86-BMI1-NEXT:    jne .LBB45_4
+; X86-BMI1-NEXT:  # %bb.3:
+; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:  .LBB45_4:
+; X86-BMI1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edi
 ; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB45_2
-; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    je .LBB45_6
+; X86-BMI1-NEXT:  # %bb.5:
 ; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB45_2:
-; X86-BMI1-NEXT:    movb $64, %cl
-; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB45_4
-; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
-; X86-BMI1-NEXT:  .LBB45_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebx
-; X86-BMI1-NEXT:    pushl %ebp
+; X86-BMI1-NEXT:  .LBB45_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebp, %esi
-; X86-BMI1-NEXT:    andl %ebx, %edi
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB45_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:  .LBB45_2:
+; X86-BMI2-NEXT:    subl $20, %esp
 ; X86-BMI2-NEXT:    movb $64, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
 ; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB45_4
+; X86-BMI2-NEXT:    movl %ecx, %eax
+; X86-BMI2-NEXT:    jne .LBB45_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:  .LBB45_2:
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movl $0, %edx
+; X86-BMI2-NEXT:    jne .LBB45_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl %ecx, %edx
 ; X86-BMI2-NEXT:  .LBB45_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %edi
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB45_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:  .LBB45_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $20, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c4_commutative:
@@ -5866,153 +5731,146 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
-; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB46_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB46_2:
+; X86-NOBMI-NEXT:    subl $16, %esp
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    movl $-1, %ebp
-; X86-NOBMI-NEXT:    shrl %cl, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB46_4
+; X86-NOBMI-NEXT:    movl $0, %edi
+; X86-NOBMI-NEXT:    jne .LBB46_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:  .LBB46_2:
+; X86-NOBMI-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    jne .LBB46_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %ebp, %ebx
-; X86-NOBMI-NEXT:    xorl %ebp, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:  .LBB46_4:
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl %ebp
-; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    andl %ebx, %esi
-; X86-NOBMI-NEXT:    andl %ebp, %edi
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, (%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB46_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl %edx, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:  .LBB46_6:
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    andl %edx, %edi
 ; X86-NOBMI-NEXT:    calll use64@PLT
-; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_c5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    shrl %cl, %edi
-; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB46_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edi, %esi
-; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:  .LBB46_2:
+; X86-BMI1-NEXT:    subl $16, %esp
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    shrl %cl, %ebp
+; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    shrl %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB46_4
+; X86-BMI1-NEXT:    movl $0, %edi
+; X86-BMI1-NEXT:    jne .LBB46_2
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:  .LBB46_2:
+; X86-BMI1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    jne .LBB46_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebp, %ebx
-; X86-BMI1-NEXT:    xorl %ebp, %ebp
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:  .LBB46_4:
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl %ebp
-; X86-BMI1-NEXT:    pushl %ebx
+; X86-BMI1-NEXT:    movl %esi, (%esp)
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    andl %ebx, %esi
-; X86-BMI1-NEXT:    andl %ebp, %edi
-; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %ecx, (%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-BMI1-NEXT:    testb $32, %cl
+; X86-BMI1-NEXT:    je .LBB46_6
+; X86-BMI1-NEXT:  # %bb.5:
+; X86-BMI1-NEXT:    movl %edx, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:  .LBB46_6:
+; X86-BMI1-NEXT:    andl %eax, %esi
+; X86-BMI1-NEXT:    andl %edx, %edi
 ; X86-BMI1-NEXT:    calll use64@PLT
-; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    addl $16, %esp
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_c5_skipextrauses:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebp
-; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    subl $12, %esp
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %edi
-; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB46_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI2-NEXT:  .LBB46_2:
+; X86-BMI2-NEXT:    subl $20, %esp
 ; X86-BMI2-NEXT:    movb $64, %al
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-BMI2-NEXT:    movl $-1, %ebp
-; X86-BMI2-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %esi
 ; X86-BMI2-NEXT:    testb $32, %al
-; X86-BMI2-NEXT:    je .LBB46_4
+; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB46_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %esi, %edi
+; X86-BMI2-NEXT:  .LBB46_2:
+; X86-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    jne .LBB46_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %ebx, %ebp
-; X86-BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI2-NEXT:    movl $-1, %esi
 ; X86-BMI2-NEXT:  .LBB46_4:
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    pushl %ebp
+; X86-BMI2-NEXT:    movl %esi, (%esp)
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
-; X86-BMI2-NEXT:    andl %ebp, %esi
-; X86-BMI2-NEXT:    andl %ebx, %edi
-; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    je .LBB46_6
+; X86-BMI2-NEXT:  # %bb.5:
+; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:  .LBB46_6:
+; X86-BMI2-NEXT:    andl %eax, %esi
+; X86-BMI2-NEXT:    andl %edx, %edi
 ; X86-BMI2-NEXT:    calll use64@PLT
-; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    movl %edi, %edx
-; X86-BMI2-NEXT:    addl $12, %esp
+; X86-BMI2-NEXT:    addl $20, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
-; X86-BMI2-NEXT:    popl %ebx
-; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c5_skipextrauses:
@@ -6232,41 +6090,37 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1-LABEL: bextr64_32_c1:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB48_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB48_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_c1:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB48_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB48_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_c1:
@@ -6331,41 +6185,37 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1-LABEL: bextr64_32_c2:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB49_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB49_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_c2:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB49_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB49_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_c2:
@@ -6665,9 +6515,9 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_d2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
@@ -6678,20 +6528,20 @@ define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1-LABEL: bextr32_d2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_d2_load:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -6731,9 +6581,9 @@ define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_d3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
@@ -6744,20 +6594,20 @@ define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 ;
 ; X86-BMI1-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, (%eax), %eax
+; X86-BMI1-NEXT:    orl %eax, %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
 ; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    retl
 ;
@@ -6801,16 +6651,16 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %ecx, (%esp)
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %eax, (%esp)
 ; X86-NOBMI-NEXT:    calll use32@PLT
 ; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:    addl $8, %esp
@@ -6821,13 +6671,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1:       # %bb.0:
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    movzbl %al, %edx
-; X86-BMI1-NEXT:    orl %ecx, %edx
-; X86-BMI1-NEXT:    bextrl %edx, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movl %eax, (%esp)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    movzbl %al, %eax
+; X86-BMI1-NEXT:    orl %ecx, %eax
+; X86-BMI1-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    calll use32@PLT
 ; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:    addl $8, %esp
@@ -6838,11 +6688,11 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    subl $8, %esp
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %esi
-; X86-BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %esi
 ; X86-BMI2-NEXT:    calll use32@PLT
 ; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:    addl $8, %esp
@@ -6901,42 +6751,39 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_d0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB56_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB56_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB56_4
+; X86-NOBMI-NEXT:    je .LBB56_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB56_4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB56_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB56_6:
-; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB56_8
 ; X86-NOBMI-NEXT:  # %bb.7:
@@ -6944,47 +6791,43 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:  .LBB56_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_d0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %edi
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB56_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    xorl %eax, %eax
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB56_2:
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    shldl %cl, %edi, %eax
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %edi, %ebx
-; X86-BMI1-NEXT:    jne .LBB56_4
+; X86-BMI1-NEXT:    je .LBB56_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %eax, %ebx
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB56_4:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB56_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB56_6:
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB56_8
 ; X86-BMI1-NEXT:  # %bb.7:
@@ -6992,11 +6835,11 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1-NEXT:  .LBB56_8:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_d0:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
@@ -7014,26 +6857,29 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB56_4
+; X86-BMI2-NEXT:    movl %ebx, %edi
+; X86-BMI2-NEXT:    jne .LBB56_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    movl %esi, %edi
 ; X86-BMI2-NEXT:  .LBB56_4:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-BMI2-NEXT:    movl $0, %esi
 ; X86-BMI2-NEXT:    jne .LBB56_6
 ; X86-BMI2-NEXT:  # %bb.5:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %ebx, %esi
 ; X86-BMI2-NEXT:  .LBB56_6:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB56_8
 ; X86-BMI2-NEXT:  # %bb.7:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:  .LBB56_8:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_d0:
@@ -7071,42 +6917,39 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_d1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB57_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB57_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB57_4
+; X86-NOBMI-NEXT:    je .LBB57_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB57_4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB57_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB57_6:
-; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB57_8
 ; X86-NOBMI-NEXT:  # %bb.7:
@@ -7114,47 +6957,43 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:  .LBB57_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_d1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %edi
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB57_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    xorl %eax, %eax
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB57_2:
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    shldl %cl, %edi, %eax
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %edi, %ebx
-; X86-BMI1-NEXT:    jne .LBB57_4
+; X86-BMI1-NEXT:    je .LBB57_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %eax, %ebx
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB57_4:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB57_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB57_6:
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB57_8
 ; X86-BMI1-NEXT:  # %bb.7:
@@ -7162,11 +7001,11 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1-NEXT:  .LBB57_8:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_d1_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
@@ -7184,26 +7023,29 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    shlxl %ecx, %eax, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB57_4
+; X86-BMI2-NEXT:    movl %ebx, %edi
+; X86-BMI2-NEXT:    jne .LBB57_4
 ; X86-BMI2-NEXT:  # %bb.3:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    movl %esi, %edi
 ; X86-BMI2-NEXT:  .LBB57_4:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-BMI2-NEXT:    movl $0, %esi
 ; X86-BMI2-NEXT:    jne .LBB57_6
 ; X86-BMI2-NEXT:  # %bb.5:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %ebx, %esi
 ; X86-BMI2-NEXT:  .LBB57_6:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB57_8
 ; X86-BMI2-NEXT:  # %bb.7:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI2-NEXT:  .LBB57_8:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_d1_indexzext:
@@ -7245,43 +7087,40 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_d2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edi
-; X86-NOBMI-NEXT:    movl 4(%eax), %edx
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movl 4(%eax), %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB58_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB58_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB58_4
+; X86-NOBMI-NEXT:    je .LBB58_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB58_4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB58_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB58_6:
-; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB58_8
 ; X86-NOBMI-NEXT:  # %bb.7:
@@ -7289,48 +7128,44 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:  .LBB58_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_d2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edi
-; X86-BMI1-NEXT:    movl 4(%eax), %edx
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %edi
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl (%eax), %esi
+; X86-BMI1-NEXT:    movl 4(%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB58_2
-; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    xorl %eax, %eax
+; X86-BMI1-NEXT:  # %bb.1:
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB58_2:
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    shldl %cl, %edi, %eax
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %edi, %ebx
-; X86-BMI1-NEXT:    jne .LBB58_4
+; X86-BMI1-NEXT:    je .LBB58_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %eax, %ebx
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB58_4:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB58_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB58_6:
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB58_8
 ; X86-BMI1-NEXT:  # %bb.7:
@@ -7338,49 +7173,52 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1-NEXT:  .LBB58_8:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_d2_load:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl (%edx), %eax
-; X86-BMI2-NEXT:    movl 4(%edx), %edx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %esi
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl (%eax), %esi
+; X86-BMI2-NEXT:    movl 4(%eax), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB58_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %esi, %eax
-; X86-BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB58_2:
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    shldl %cl, %esi, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %esi, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB58_4
-; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB58_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB58_4:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB58_6
 ; X86-BMI2-NEXT:  # %bb.5:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB58_6:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB58_8
 ; X86-BMI2-NEXT:  # %bb.7:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB58_8:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_d2_load:
@@ -7419,43 +7257,40 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_d3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edi
-; X86-NOBMI-NEXT:    movl 4(%eax), %edx
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movl 4(%eax), %eax
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB59_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB59_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
-; X86-NOBMI-NEXT:    shll %cl, %edi
+; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
+; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB59_4
+; X86-NOBMI-NEXT:    je .LBB59_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %eax, %ebx
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB59_4:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB59_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB59_6:
-; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB59_8
 ; X86-NOBMI-NEXT:  # %bb.7:
@@ -7463,48 +7298,44 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 ; X86-NOBMI-NEXT:  .LBB59_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_d3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edi
-; X86-BMI1-NEXT:    movl 4(%eax), %edx
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %edi
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl (%eax), %esi
+; X86-BMI1-NEXT:    movl 4(%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB59_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
-; X86-BMI1-NEXT:    xorl %eax, %eax
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB59_2:
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    shldl %cl, %edi, %eax
-; X86-BMI1-NEXT:    shll %cl, %edi
+; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
+; X86-BMI1-NEXT:    shll %cl, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %edi, %ebx
-; X86-BMI1-NEXT:    jne .LBB59_4
+; X86-BMI1-NEXT:    je .LBB59_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %eax, %ebx
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB59_4:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB59_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB59_6:
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %esi
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB59_8
 ; X86-BMI1-NEXT:  # %bb.7:
@@ -7512,49 +7343,52 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 ; X86-BMI1-NEXT:  .LBB59_8:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_d3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl (%edx), %eax
-; X86-BMI2-NEXT:    movl 4(%edx), %edx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %esi
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl (%eax), %esi
+; X86-BMI2-NEXT:    movl 4(%eax), %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB59_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %esi, %eax
-; X86-BMI2-NEXT:    xorl %esi, %esi
+; X86-BMI2-NEXT:    movl %eax, %esi
+; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB59_2:
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    shldl %cl, %esi, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %esi, %edi
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB59_4
-; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB59_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB59_4:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB59_6
 ; X86-BMI2-NEXT:  # %bb.5:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB59_6:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB59_8
 ; X86-BMI2-NEXT:  # %bb.7:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB59_8:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_d3_load_indexzext:
@@ -7597,176 +7431,170 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_d5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebp
 ; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %al
 ; X86-NOBMI-NEXT:    je .LBB60_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
-; X86-NOBMI-NEXT:    xorl %esi, %esi
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB60_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shldl %cl, %ebx, %esi
-; X86-NOBMI-NEXT:    shll %cl, %ebx
+; X86-NOBMI-NEXT:    shldl %cl, %edx, %ebx
+; X86-NOBMI-NEXT:    shll %cl, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    jne .LBB60_4
+; X86-NOBMI-NEXT:    je .LBB60_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ebp
+; X86-NOBMI-NEXT:    movl %edx, %ebx
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:  .LBB60_4:
-; X86-NOBMI-NEXT:    movl %ebp, %esi
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %ebx, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edi
 ; X86-NOBMI-NEXT:    jne .LBB60_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:  .LBB60_6:
-; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    jne .LBB60_8
 ; X86-NOBMI-NEXT:  # %bb.7:
-; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    movl %edx, %edi
 ; X86-NOBMI-NEXT:  .LBB60_8:
 ; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    pushl %ecx
 ; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64@PLT
 ; X86-NOBMI-NEXT:    addl $16, %esp
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:    addl $12, %esp
+; X86-NOBMI-NEXT:    movl %edi, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
-; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bextr64_d5_skipextrauses:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebp
 ; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl %esi, %ebx
 ; X86-BMI1-NEXT:    movl %eax, %ecx
-; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %edx, %ebx
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:    testb $32, %al
 ; X86-BMI1-NEXT:    je .LBB60_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %ebx
-; X86-BMI1-NEXT:    xorl %esi, %esi
+; X86-BMI1-NEXT:    movl %ebx, %edx
+; X86-BMI1-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1-NEXT:  .LBB60_2:
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    shldl %cl, %ebx, %esi
-; X86-BMI1-NEXT:    shll %cl, %ebx
+; X86-BMI1-NEXT:    shldl %cl, %edx, %ebx
+; X86-BMI1-NEXT:    shll %cl, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %ebx, %ebp
-; X86-BMI1-NEXT:    jne .LBB60_4
+; X86-BMI1-NEXT:    je .LBB60_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ebp
+; X86-BMI1-NEXT:    movl %edx, %ebx
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:  .LBB60_4:
-; X86-BMI1-NEXT:    movl %ebp, %esi
-; X86-BMI1-NEXT:    shrl %cl, %esi
+; X86-BMI1-NEXT:    movl %ebx, %edi
+; X86-BMI1-NEXT:    shrl %cl, %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edi
 ; X86-BMI1-NEXT:    jne .LBB60_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %ebx, %edx
-; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    movl %edi, %esi
 ; X86-BMI1-NEXT:  .LBB60_6:
-; X86-BMI1-NEXT:    shrdl %cl, %ebp, %edx
+; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    jne .LBB60_8
 ; X86-BMI1-NEXT:  # %bb.7:
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl %edx, %edi
 ; X86-BMI1-NEXT:  .LBB60_8:
 ; X86-BMI1-NEXT:    subl $8, %esp
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    pushl %ecx
 ; X86-BMI1-NEXT:    pushl %eax
 ; X86-BMI1-NEXT:    calll use64@PLT
 ; X86-BMI1-NEXT:    addl $16, %esp
-; X86-BMI1-NEXT:    movl %esi, %eax
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    addl $12, %esp
+; X86-BMI1-NEXT:    movl %edi, %eax
+; X86-BMI1-NEXT:    movl %esi, %edx
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    popl %ebx
-; X86-BMI1-NEXT:    popl %ebp
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_d5_skipextrauses:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebp
 ; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    subl $12, %esp
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %edi
-; X86-BMI2-NEXT:    shrxl %eax, %edx, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %eax, %esi, %edi
 ; X86-BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI2-NEXT:    testb $32, %al
 ; X86-BMI2-NEXT:    je .LBB60_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %edi
-; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    movl %edi, %edx
+; X86-BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI2-NEXT:  .LBB60_2:
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %edi, %edx
-; X86-BMI2-NEXT:    shlxl %ecx, %edi, %ebx
+; X86-BMI2-NEXT:    shldl %cl, %edx, %edi
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB60_4
-; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %ebx, %edx
-; X86-BMI2-NEXT:    movl $0, %ebx
+; X86-BMI2-NEXT:    jne .LBB60_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI2-NEXT:  .LBB60_4:
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edi
+; X86-BMI2-NEXT:    movl $0, %ebp
 ; X86-BMI2-NEXT:    jne .LBB60_6
 ; X86-BMI2-NEXT:  # %bb.5:
 ; X86-BMI2-NEXT:    movl %edi, %esi
+; X86-BMI2-NEXT:    movl %ebx, %ebp
 ; X86-BMI2-NEXT:  .LBB60_6:
-; X86-BMI2-NEXT:    shrdl %cl, %edx, %ebx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %ebp
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    jne .LBB60_8
 ; X86-BMI2-NEXT:  # %bb.7:
-; X86-BMI2-NEXT:    movl %ebx, %edi
+; X86-BMI2-NEXT:    movl %ebp, %edi
 ; X86-BMI2-NEXT:  .LBB60_8:
 ; X86-BMI2-NEXT:    subl $8, %esp
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    pushl %ecx
 ; X86-BMI2-NEXT:    pushl %eax
 ; X86-BMI2-NEXT:    calll use64@PLT
 ; X86-BMI2-NEXT:    addl $16, %esp
 ; X86-BMI2-NEXT:    movl %edi, %eax
 ; X86-BMI2-NEXT:    movl %esi, %edx
+; X86-BMI2-NEXT:    addl $12, %esp
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    popl %ebp
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_d5_skipextrauses:
@@ -7983,41 +7811,37 @@ define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1-LABEL: bextr64_32_d1:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl %edi, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
+; X86-BMI1-NEXT:    movl %esi, %eax
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    shrdl %cl, %esi, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB62_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %esi, %edx
+; X86-BMI1-NEXT:    movl %edx, %eax
 ; X86-BMI1-NEXT:  .LBB62_2:
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    popl %esi
-; X86-BMI1-NEXT:    popl %edi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bextr64_32_d1:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    je .LBB62_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    shrxl %ecx, %esi, %edx
+; X86-BMI2-NEXT:    shrxl %ecx, %edx, %eax
 ; X86-BMI2-NEXT:  .LBB62_2:
-; X86-BMI2-NEXT:    bzhil %eax, %edx, %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_32_d1:
@@ -8064,28 +7888,28 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
 ; X86-NOBMI-LABEL: pr38938:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl (%eax), %eax
+; X86-NOBMI-NEXT:    shrl $19, %eax
+; X86-NOBMI-NEXT:    andl $4092, %eax # imm = 0xFFC
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl (%ecx), %ecx
-; X86-NOBMI-NEXT:    shrl $19, %ecx
-; X86-NOBMI-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NOBMI-NEXT:    incl (%eax,%ecx)
+; X86-NOBMI-NEXT:    incl (%ecx,%eax)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMINOTBM-LABEL: pr38938:
 ; X86-BMINOTBM:       # %bb.0:
-; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT:    movl $2581, %eax # imm = 0xA15
+; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMINOTBM-NEXT:    movl $2581, %edx # imm = 0xA15
-; X86-BMINOTBM-NEXT:    bextrl %edx, (%ecx), %ecx
-; X86-BMINOTBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMINOTBM-NEXT:    incl (%ecx,%eax,4)
 ; X86-BMINOTBM-NEXT:    retl
 ;
 ; X86-BMITBM-LABEL: pr38938:
 ; X86-BMITBM:       # %bb.0:
 ; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMITBM-NEXT:    bextrl $2581, (%eax), %eax # imm = 0xA15
 ; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMITBM-NEXT:    bextrl $2581, (%ecx), %ecx # imm = 0xA15
-; X86-BMITBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMITBM-NEXT:    incl (%ecx,%eax,4)
 ; X86-BMITBM-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: pr38938:
@@ -8343,25 +8167,25 @@ define void @c5_i32(i32 %arg, ptr %ptr) nounwind {
 ; X86-NOBMI-LABEL: c5_i32:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    shrl $19, %eax
+; X86-NOBMI-NEXT:    andl $1023, %eax # imm = 0x3FF
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shrl $19, %ecx
-; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
-; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMINOTBM-LABEL: c5_i32:
 ; X86-BMINOTBM:       # %bb.0:
-; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMINOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
-; X86-BMINOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMINOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMINOTBM-NEXT:    movl $2579, %eax # imm = 0xA13
+; X86-BMINOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT:    movl %eax, (%ecx)
 ; X86-BMINOTBM-NEXT:    retl
 ;
 ; X86-BMITBM-LABEL: c5_i32:
 ; X86-BMITBM:       # %bb.0:
-; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMITBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
-; X86-BMITBM-NEXT:    movl %ecx, (%eax)
+; X86-BMITBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %eax # imm = 0xA13
+; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMITBM-NEXT:    movl %eax, (%ecx)
 ; X86-BMITBM-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: c5_i32:
@@ -8394,25 +8218,25 @@ define void @c6_i32(i32 %arg, ptr %ptr) nounwind {
 ; X86-NOBMI-LABEL: c6_i32:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    shrl $19, %eax
+; X86-NOBMI-NEXT:    andl $4095, %eax # imm = 0xFFF
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shrl $19, %ecx
-; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
-; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMINOTBM-LABEL: c6_i32:
 ; X86-BMINOTBM:       # %bb.0:
-; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMINOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
-; X86-BMINOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMINOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMINOTBM-NEXT:    movl $3091, %eax # imm = 0xC13
+; X86-BMINOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT:    movl %eax, (%ecx)
 ; X86-BMINOTBM-NEXT:    retl
 ;
 ; X86-BMITBM-LABEL: c6_i32:
 ; X86-BMITBM:       # %bb.0:
-; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMITBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
-; X86-BMITBM-NEXT:    movl %ecx, (%eax)
+; X86-BMITBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %eax # imm = 0xC13
+; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMITBM-NEXT:    movl %eax, (%ecx)
 ; X86-BMITBM-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: c6_i32:
@@ -8445,10 +8269,10 @@ define void @c7_i32(i32 %arg, ptr %ptr) nounwind {
 ; X86-LABEL: c7_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    andl $4092, %eax # imm = 0xFFC
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $17, %ecx
-; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: c7_i32:
@@ -8471,28 +8295,28 @@ define void @c5_i64(i64 %arg, ptr %ptr) nounwind {
 ; X86-NOBMI-LABEL: c5_i64:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    shrl $19, %eax
+; X86-NOBMI-NEXT:    andl $1023, %eax # imm = 0x3FF
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shrl $19, %ecx
-; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
-; X86-NOBMI-NEXT:    movl %ecx, (%eax)
-; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
+; X86-NOBMI-NEXT:    movl $0, 4(%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMINOTBM-LABEL: c5_i64:
 ; X86-BMINOTBM:       # %bb.0:
-; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMINOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
-; X86-BMINOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMINOTBM-NEXT:    movl %ecx, (%eax)
-; X86-BMINOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMINOTBM-NEXT:    movl $2579, %eax # imm = 0xA13
+; X86-BMINOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT:    movl %eax, (%ecx)
+; X86-BMINOTBM-NEXT:    movl $0, 4(%ecx)
 ; X86-BMINOTBM-NEXT:    retl
 ;
 ; X86-BMITBM-LABEL: c5_i64:
 ; X86-BMITBM:       # %bb.0:
-; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMITBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
-; X86-BMITBM-NEXT:    movl %ecx, (%eax)
-; X86-BMITBM-NEXT:    movl $0, 4(%eax)
+; X86-BMITBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %eax # imm = 0xA13
+; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMITBM-NEXT:    movl %eax, (%ecx)
+; X86-BMITBM-NEXT:    movl $0, 4(%ecx)
 ; X86-BMITBM-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: c5_i64:
@@ -8525,28 +8349,28 @@ define void @c6_i64(i64 %arg, ptr %ptr) nounwind {
 ; X86-NOBMI-LABEL: c6_i64:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    shrl $19, %eax
+; X86-NOBMI-NEXT:    andl $4095, %eax # imm = 0xFFF
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shrl $19, %ecx
-; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
-; X86-NOBMI-NEXT:    movl %ecx, (%eax)
-; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
+; X86-NOBMI-NEXT:    movl $0, 4(%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMINOTBM-LABEL: c6_i64:
 ; X86-BMINOTBM:       # %bb.0:
-; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMINOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
-; X86-BMINOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMINOTBM-NEXT:    movl %ecx, (%eax)
-; X86-BMINOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMINOTBM-NEXT:    movl $3091, %eax # imm = 0xC13
+; X86-BMINOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMINOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMINOTBM-NEXT:    movl %eax, (%ecx)
+; X86-BMINOTBM-NEXT:    movl $0, 4(%ecx)
 ; X86-BMINOTBM-NEXT:    retl
 ;
 ; X86-BMITBM-LABEL: c6_i64:
 ; X86-BMITBM:       # %bb.0:
-; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMITBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
-; X86-BMITBM-NEXT:    movl %ecx, (%eax)
-; X86-BMITBM-NEXT:    movl $0, 4(%eax)
+; X86-BMITBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %eax # imm = 0xC13
+; X86-BMITBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMITBM-NEXT:    movl %eax, (%ecx)
+; X86-BMITBM-NEXT:    movl $0, 4(%ecx)
 ; X86-BMITBM-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: c6_i64:
@@ -8579,11 +8403,11 @@ define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
 ; X86-LABEL: c7_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $17, %eax
+; X86-NEXT:    andl $4092, %eax # imm = 0xFFC
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $17, %ecx
-; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl $0, 4(%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: c7_i64:
@@ -8602,11 +8426,11 @@ define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
 define i64 @c8_i64(i64 %arg, ptr %ptr) nounwind {
 ; X86-LABEL: c8_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    shrl $19, %eax
 ; X86-NEXT:    andl $4092, %eax # imm = 0xFFC
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll
index aff5ac7437a29..2865c0b94770a 100644
--- a/llvm/test/CodeGen/X86/extract-lowbits.ll
+++ b/llvm/test/CodeGen/X86/extract-lowbits.ll
@@ -129,27 +129,27 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 define i32 @bzhi32_a2_load(ptr %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_a2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl (%edx), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_a2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_a2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_a2_load:
@@ -182,27 +182,27 @@ define i32 @bzhi32_a2_load(ptr %w, i32 %numlowbits) nounwind {
 define i32 @bzhi32_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_a3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
-; X86-NOBMI-NEXT:    andl (%edx), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_a3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_a3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_a3_load_indexzext:
@@ -546,8 +546,6 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_a2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
@@ -561,15 +559,13 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:  .LBB8_2:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl 4(%ecx), %edx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_a2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl $1, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
@@ -583,15 +579,13 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-BMI1-NEXT:  .LBB8_2:
 ; X86-BMI1-NEXT:    addl $-1, %eax
 ; X86-BMI1-NEXT:    adcl $-1, %edx
-; X86-BMI1-NEXT:    andl 4(%esi), %edx
-; X86-BMI1-NEXT:    andl (%esi), %eax
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andl 4(%ecx), %edx
+; X86-BMI1-NEXT:    andl (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_a2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $1, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
@@ -605,9 +599,9 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-BMI2-NEXT:  .LBB8_2:
 ; X86-BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI2-NEXT:    adcl $-1, %edx
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_a2_load:
@@ -640,8 +634,6 @@ define i64 @bzhi64_a2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_a3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
@@ -655,15 +647,13 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:  .LBB9_2:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl 4(%ecx), %edx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_a3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1-NEXT:    movl $1, %eax
 ; X86-BMI1-NEXT:    xorl %edx, %edx
@@ -677,15 +667,13 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-BMI1-NEXT:  .LBB9_2:
 ; X86-BMI1-NEXT:    addl $-1, %eax
 ; X86-BMI1-NEXT:    adcl $-1, %edx
-; X86-BMI1-NEXT:    andl 4(%esi), %edx
-; X86-BMI1-NEXT:    andl (%esi), %eax
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andl 4(%ecx), %edx
+; X86-BMI1-NEXT:    andl (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_a3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl $1, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
@@ -699,9 +687,9 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-BMI2-NEXT:  .LBB9_2:
 ; X86-BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI2-NEXT:    adcl $-1, %edx
-; X86-BMI2-NEXT:    andl 4(%esi), %edx
-; X86-BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    andl 4(%ecx), %edx
+; X86-BMI2-NEXT:    andl (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_a3_load_indexzext:
@@ -950,10 +938,10 @@ define i32 @bzhi64_32_a1(i64 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi64_32_a1_trunc_extrause(i64 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_32_a1_trunc_extrause:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %edx, (%eax)
+; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    decl %eax
@@ -962,21 +950,21 @@ define i32 @bzhi64_32_a1_trunc_extrause(i64 %val, i32 %numlowbits, ptr %escape)
 ;
 ; X86-BMI1-LABEL: bzhi64_32_a1_trunc_extrause:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl %ecx, (%edx)
-; X86-BMI1-NEXT:    shll $8, %eax
-; X86-BMI1-NEXT:    bextrl %eax, %ecx, %eax
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    shll $8, %ecx
+; X86-BMI1-NEXT:    bextrl %ecx, %eax, %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_32_a1_trunc_extrause:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl %ecx, (%edx)
-; X86-BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI2-NEXT:    movl %eax, (%ecx)
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_32_a1_trunc_extrause:
@@ -1244,27 +1232,27 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_b2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl (%edx), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_b2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_b2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_b2_load:
@@ -1297,27 +1285,27 @@ define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
 define i32 @bzhi32_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl (%edx), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_b3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_b3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_b3_load_indexzext:
@@ -1402,25 +1390,20 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB21_1
-; X86-NOBMI-NEXT:  # %bb.2:
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    jmp .LBB21_3
-; X86-NOBMI-NEXT:  .LBB21_1:
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:  .LBB21_3:
-; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    je .LBB21_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB21_2:
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_b0:
@@ -1483,25 +1466,20 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB22_1
-; X86-NOBMI-NEXT:  # %bb.2:
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    jmp .LBB22_3
-; X86-NOBMI-NEXT:  .LBB22_1:
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:  .LBB22_3:
-; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    je .LBB22_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB22_2:
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_b1_indexzext:
@@ -1567,65 +1545,54 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shll %cl, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB23_1
-; X86-NOBMI-NEXT:  # %bb.2:
-; X86-NOBMI-NEXT:    movl %edi, %eax
-; X86-NOBMI-NEXT:    jmp .LBB23_3
-; X86-NOBMI-NEXT:  .LBB23_1:
-; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:  .LBB23_3:
-; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    je .LBB23_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB23_2:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_b2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edx
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB23_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:    xorl %eax, %eax
 ; X86-BMI1-NEXT:  .LBB23_2:
-; X86-BMI1-NEXT:    andnl (%edx), %eax, %eax
-; X86-BMI1-NEXT:    andnl 4(%edx), %esi, %edx
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andnl (%ecx), %eax, %eax
+; X86-BMI1-NEXT:    andnl 4(%ecx), %edx, %edx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_b2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shlxl %edx, %ecx, %eax
+; X86-BMI2-NEXT:    testb $32, %dl
 ; X86-BMI2-NEXT:    je .LBB23_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB23_2:
-; X86-BMI2-NEXT:    andnl (%ecx), %eax, %eax
-; X86-BMI2-NEXT:    andnl 4(%ecx), %edx, %edx
-; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    andnl (%edx), %eax, %eax
+; X86-BMI2-NEXT:    andnl 4(%edx), %ecx, %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b2_load:
@@ -1658,65 +1625,54 @@ define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shll %cl, %edi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB24_1
-; X86-NOBMI-NEXT:  # %bb.2:
-; X86-NOBMI-NEXT:    movl %edi, %eax
-; X86-NOBMI-NEXT:    jmp .LBB24_3
-; X86-NOBMI-NEXT:  .LBB24_1:
-; X86-NOBMI-NEXT:    movl %edi, %edx
-; X86-NOBMI-NEXT:  .LBB24_3:
-; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    je .LBB24_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB24_2:
 ; X86-NOBMI-NEXT:    notl %eax
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
-; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    andl (%ecx), %eax
+; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    andl 4(%ecx), %edx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_b3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edx
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    shll %cl, %eax
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB24_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %esi
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:    xorl %eax, %eax
 ; X86-BMI1-NEXT:  .LBB24_2:
-; X86-BMI1-NEXT:    andnl (%edx), %eax, %eax
-; X86-BMI1-NEXT:    andnl 4(%edx), %esi, %edx
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    andnl (%ecx), %eax, %eax
+; X86-BMI1-NEXT:    andnl 4(%ecx), %edx, %edx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_b3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    shlxl %ebx, %edx, %eax
-; X86-BMI2-NEXT:    testb $32, %bl
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shlxl %edx, %ecx, %eax
+; X86-BMI2-NEXT:    testb $32, %dl
 ; X86-BMI2-NEXT:    je .LBB24_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI2-NEXT:    xorl %eax, %eax
 ; X86-BMI2-NEXT:  .LBB24_2:
-; X86-BMI2-NEXT:    andnl (%ecx), %eax, %eax
-; X86-BMI2-NEXT:    andnl 4(%ecx), %edx, %edx
-; X86-BMI2-NEXT:    popl %ebx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    andnl (%edx), %eax, %eax
+; X86-BMI2-NEXT:    andnl 4(%edx), %ecx, %edx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_b3_load_indexzext:
@@ -1752,25 +1708,20 @@ define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_b4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB25_1
-; X86-NOBMI-NEXT:  # %bb.2:
-; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    jmp .LBB25_3
-; X86-NOBMI-NEXT:  .LBB25_1:
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:  .LBB25_3:
-; X86-NOBMI-NEXT:    notl %edx
+; X86-NOBMI-NEXT:    je .LBB25_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB25_2:
 ; X86-NOBMI-NEXT:    notl %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_b4_commutative:
@@ -2093,36 +2044,36 @@ define i32 @bzhi64_32_b3(i64 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    movl %eax, (%edx)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_c0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    movl %eax, (%edx)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_c0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    bzhil %ecx, %edx, %edx
-; X86-BMI2-NEXT:    movl %edx, (%eax)
-; X86-BMI2-NEXT:    bzhil %ecx, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c0:
@@ -2164,36 +2115,36 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    movl %eax, (%edx)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    movl %eax, (%edx)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    bzhil %ecx, %edx, %edx
-; X86-BMI2-NEXT:    movl %edx, (%eax)
-; X86-BMI2-NEXT:    bzhil %ecx, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c1_indexzext:
@@ -2236,47 +2187,41 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr %escape) nounwind
 define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    movl %esi, (%edx)
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edx, (%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_c2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edx
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1-NEXT:    shrl %cl, %esi
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl (%eax), %eax
-; X86-BMI1-NEXT:    andl %esi, %eax
-; X86-BMI1-NEXT:    movl %esi, (%edx)
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    andl %edx, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, (%ecx)
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_c2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    bzhil %edx, %esi, %esi
-; X86-BMI2-NEXT:    bzhil %edx, (%eax), %eax
-; X86-BMI2-NEXT:    movl %esi, (%ecx)
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    bzhil %ecx, %edx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c2_load:
@@ -2321,47 +2266,41 @@ define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits, ptr %escape) nounwind {
 define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    movl %esi, (%edx)
-; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    andl %edx, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edx, (%ecx)
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl $-1, %esi
+; X86-BMI1-NEXT:    movl $-1, %edx
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1-NEXT:    shrl %cl, %esi
+; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl (%eax), %eax
-; X86-BMI1-NEXT:    andl %esi, %eax
-; X86-BMI1-NEXT:    movl %esi, (%edx)
-; X86-BMI1-NEXT:    popl %esi
+; X86-BMI1-NEXT:    andl %edx, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, (%ecx)
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    bzhil %edx, %esi, %esi
-; X86-BMI2-NEXT:    bzhil %edx, (%eax), %eax
-; X86-BMI2-NEXT:    movl %esi, (%ecx)
-; X86-BMI2-NEXT:    popl %esi
+; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movl $-1, %edx
+; X86-BMI2-NEXT:    bzhil %ecx, %edx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c3_load_indexzext:
@@ -2407,36 +2346,36 @@ define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwi
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    movl %eax, (%edx)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
 ; X86-BMI1-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    movl %eax, (%edx)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_c4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl $-1, %edx
-; X86-BMI2-NEXT:    bzhil %ecx, %edx, %edx
-; X86-BMI2-NEXT:    movl %edx, (%eax)
-; X86-BMI2-NEXT:    bzhil %ecx, {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    bzhil %eax, %ecx, %ecx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl %ecx, (%edx)
+; X86-BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
@@ -2480,67 +2419,71 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, ptr %escape) nounwi
 define i64 @bzhi64_c0(i64 %val, i64 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB35_2
+; X86-NOBMI-NEXT:    jne .LBB35_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB35_2:
-; X86-NOBMI-NEXT:    movl %edx, 4(%esi)
-; X86-NOBMI-NEXT:    movl %eax, (%esi)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edx, 4(%ecx)
+; X86-NOBMI-NEXT:    jne .LBB35_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:  .LBB35_4:
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_c0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
-; X86-BMI1-NEXT:    movl $-1, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB35_2
+; X86-BMI1-NEXT:    jne .LBB35_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB35_2:
-; X86-BMI1-NEXT:    movl %edx, 4(%esi)
-; X86-BMI1-NEXT:    movl %eax, (%esi)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI1-NEXT:    jne .LBB35_4
+; X86-BMI1-NEXT:  # %bb.3:
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:  .LBB35_4:
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_c0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movb $64, %bl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
-; X86-BMI2-NEXT:    je .LBB35_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB35_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:  .LBB35_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI2-NEXT:    jne .LBB35_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:  .LBB35_4:
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c0:
@@ -2582,67 +2525,71 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits, ptr %escape) nounwind {
 define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB36_2
+; X86-NOBMI-NEXT:    jne .LBB36_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB36_2:
-; X86-NOBMI-NEXT:    movl %edx, 4(%esi)
-; X86-NOBMI-NEXT:    movl %eax, (%esi)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edx, 4(%ecx)
+; X86-NOBMI-NEXT:    jne .LBB36_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:  .LBB36_4:
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
-; X86-BMI1-NEXT:    movl $-1, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB36_2
+; X86-BMI1-NEXT:    jne .LBB36_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB36_2:
-; X86-BMI1-NEXT:    movl %edx, 4(%esi)
-; X86-BMI1-NEXT:    movl %eax, (%esi)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI1-NEXT:    jne .LBB36_4
+; X86-BMI1-NEXT:  # %bb.3:
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:  .LBB36_4:
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movb $64, %bl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
-; X86-BMI2-NEXT:    je .LBB36_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB36_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:  .LBB36_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI2-NEXT:    jne .LBB36_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:  .LBB36_4:
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c1_indexzext:
@@ -2686,84 +2633,80 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits, ptr %escape) nounwind
 define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB37_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebx, %edi
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB37_2:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl 4(%eax), %edx
-; X86-NOBMI-NEXT:    andl %ebx, %edx
+; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    andl %edi, %eax
-; X86-NOBMI-NEXT:    movl %ebx, 4(%esi)
-; X86-NOBMI-NEXT:    movl %edi, (%esi)
+; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edi, 4(%ecx)
+; X86-NOBMI-NEXT:    movl %esi, (%ecx)
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_c2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    shrl %cl, %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB37_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB37_2:
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl 4(%eax), %edx
-; X86-BMI1-NEXT:    andl %ebx, %edx
+; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    movl (%eax), %eax
-; X86-BMI1-NEXT:    andl %edi, %eax
-; X86-BMI1-NEXT:    movl %ebx, 4(%esi)
-; X86-BMI1-NEXT:    movl %edi, (%esi)
+; X86-BMI1-NEXT:    andl %esi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edi, 4(%ecx)
+; X86-BMI1-NEXT:    movl %esi, (%ecx)
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_c2_load:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movb $64, %dl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %dl
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shrxl %edx, %esi, %edi
-; X86-BMI2-NEXT:    testb $32, %dl
+; X86-BMI2-NEXT:    movb $64, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %esi
+; X86-BMI2-NEXT:    testb $32, %al
 ; X86-BMI2-NEXT:    je .LBB37_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl %esi, %ecx
+; X86-BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI2-NEXT:  .LBB37_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl 4(%eax), %edx
-; X86-BMI2-NEXT:    andl %edi, %edx
+; X86-BMI2-NEXT:    andl %esi, %edx
 ; X86-BMI2-NEXT:    movl (%eax), %eax
-; X86-BMI2-NEXT:    andl %esi, %eax
-; X86-BMI2-NEXT:    movl %edi, 4(%ecx)
-; X86-BMI2-NEXT:    movl %esi, (%ecx)
+; X86-BMI2-NEXT:    andl %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    movl %esi, 4(%edi)
+; X86-BMI2-NEXT:    movl %ecx, (%edi)
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
@@ -2810,84 +2753,80 @@ define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits, ptr %escape) nounwind {
 define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    movl $-1, %ebx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB38_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebx, %edi
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB38_2:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl 4(%eax), %edx
-; X86-NOBMI-NEXT:    andl %ebx, %edx
+; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    andl %edi, %eax
-; X86-NOBMI-NEXT:    movl %ebx, 4(%esi)
-; X86-NOBMI-NEXT:    movl %edi, (%esi)
+; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edi, 4(%ecx)
+; X86-NOBMI-NEXT:    movl %esi, (%ecx)
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl $-1, %esi
 ; X86-BMI1-NEXT:    movl $-1, %edi
-; X86-BMI1-NEXT:    movl $-1, %ebx
-; X86-BMI1-NEXT:    shrl %cl, %ebx
+; X86-BMI1-NEXT:    shrl %cl, %edi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    je .LBB38_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %ebx, %edi
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
+; X86-BMI1-NEXT:    movl %edi, %esi
+; X86-BMI1-NEXT:    xorl %edi, %edi
 ; X86-BMI1-NEXT:  .LBB38_2:
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl 4(%eax), %edx
-; X86-BMI1-NEXT:    andl %ebx, %edx
+; X86-BMI1-NEXT:    andl %edi, %edx
 ; X86-BMI1-NEXT:    movl (%eax), %eax
-; X86-BMI1-NEXT:    andl %edi, %eax
-; X86-BMI1-NEXT:    movl %ebx, 4(%esi)
-; X86-BMI1-NEXT:    movl %edi, (%esi)
+; X86-BMI1-NEXT:    andl %esi, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edi, 4(%ecx)
+; X86-BMI1-NEXT:    movl %esi, (%ecx)
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movb $64, %dl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %dl
-; X86-BMI2-NEXT:    movl $-1, %esi
-; X86-BMI2-NEXT:    shrxl %edx, %esi, %edi
-; X86-BMI2-NEXT:    testb $32, %dl
+; X86-BMI2-NEXT:    movb $64, %al
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %esi
+; X86-BMI2-NEXT:    testb $32, %al
 ; X86-BMI2-NEXT:    je .LBB38_2
 ; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI2-NEXT:    movl %esi, %ecx
+; X86-BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI2-NEXT:  .LBB38_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    movl 4(%eax), %edx
-; X86-BMI2-NEXT:    andl %edi, %edx
+; X86-BMI2-NEXT:    andl %esi, %edx
 ; X86-BMI2-NEXT:    movl (%eax), %eax
-; X86-BMI2-NEXT:    andl %esi, %eax
-; X86-BMI2-NEXT:    movl %edi, 4(%ecx)
-; X86-BMI2-NEXT:    movl %esi, (%ecx)
+; X86-BMI2-NEXT:    andl %ecx, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI2-NEXT:    movl %esi, 4(%edi)
+; X86-BMI2-NEXT:    movl %ecx, (%edi)
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
 ; X86-BMI2-NEXT:    retl
@@ -2936,67 +2875,71 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits, ptr %escape) nounwi
 define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits, ptr %escape) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB39_2
+; X86-NOBMI-NEXT:    jne .LBB39_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB39_2:
-; X86-NOBMI-NEXT:    movl %edx, 4(%esi)
-; X86-NOBMI-NEXT:    movl %eax, (%esi)
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %edx, 4(%ecx)
+; X86-NOBMI-NEXT:    jne .LBB39_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl $-1, %eax
+; X86-NOBMI-NEXT:  .LBB39_4:
+; X86-NOBMI-NEXT:    movl %eax, (%ecx)
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_c4_commutative:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1-NEXT:    movl $-1, %eax
-; X86-BMI1-NEXT:    movl $-1, %edx
-; X86-BMI1-NEXT:    shrl %cl, %edx
+; X86-BMI1-NEXT:    shrl %cl, %eax
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    je .LBB39_2
+; X86-BMI1-NEXT:    jne .LBB39_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %edx, %eax
-; X86-BMI1-NEXT:    xorl %edx, %edx
+; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB39_2:
-; X86-BMI1-NEXT:    movl %edx, 4(%esi)
-; X86-BMI1-NEXT:    movl %eax, (%esi)
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI1-NEXT:    jne .LBB39_4
+; X86-BMI1-NEXT:  # %bb.3:
+; X86-BMI1-NEXT:    movl $-1, %eax
+; X86-BMI1-NEXT:  .LBB39_4:
+; X86-BMI1-NEXT:    movl %eax, (%ecx)
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_c4_commutative:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    pushl %ebx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movb $64, %bl
-; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %bl
+; X86-BMI2-NEXT:    movb $64, %cl
+; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI2-NEXT:    movl $-1, %eax
-; X86-BMI2-NEXT:    shrxl %ebx, %eax, %edx
-; X86-BMI2-NEXT:    testb $32, %bl
-; X86-BMI2-NEXT:    je .LBB39_2
-; X86-BMI2-NEXT:  # %bb.1:
-; X86-BMI2-NEXT:    movl %edx, %eax
+; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI2-NEXT:    testb $32, %cl
+; X86-BMI2-NEXT:    jne .LBB39_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI2-NEXT:  .LBB39_2:
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-BMI2-NEXT:    jne .LBB39_4
+; X86-BMI2-NEXT:  # %bb.3:
+; X86-BMI2-NEXT:    movl $-1, %eax
+; X86-BMI2-NEXT:  .LBB39_4:
 ; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c4_commutative:
@@ -3112,9 +3055,9 @@ define i32 @bzhi64_32_c0(i64 %val, i64 %numlowbits) nounwind {
 define i32 @bzhi64_32_c1(i64 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_32_c1:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -3166,9 +3109,9 @@ define i32 @bzhi64_32_c1(i64 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi64_32_c2(i64 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_32_c2:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -3278,9 +3221,9 @@ define i32 @bzhi64_32_c3(i64 %val, i64 %numlowbits) nounwind {
 define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_d0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -3328,9 +3271,9 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_d1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -3390,17 +3333,17 @@ define i32 @bzhi32_d2_load(ptr %w, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1-LABEL: bzhi32_d2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_d2_load:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d2_load:
@@ -3444,17 +3387,17 @@ define i32 @bzhi32_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1-NEXT:    shll $8, %ecx
-; X86-BMI1-NEXT:    bextrl %ecx, (%eax), %eax
+; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    shll $8, %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1-NEXT:    bextrl %eax, (%ecx), %eax
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    bzhil %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_d3_load_indexzext:
@@ -3490,112 +3433,108 @@ define i32 @bzhi32_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_d0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    jne .LBB48_2
+; X86-NOBMI-NEXT:    je .LBB48_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB48_2:
 ; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB48_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB48_4:
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB48_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB48_6:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_d0:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl %eax, %esi
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    jne .LBB48_2
+; X86-BMI1-NEXT:    je .LBB48_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB48_2:
 ; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB48_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ebx
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB48_4:
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB48_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB48_6:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_d0:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB48_2
-; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB48_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB48_2:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB48_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB48_4:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB48_6
 ; X86-BMI2-NEXT:  # %bb.5:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB48_6:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_d0:
@@ -3627,112 +3566,108 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_d1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl %edx, %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    jne .LBB49_2
+; X86-NOBMI-NEXT:    je .LBB49_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB49_2:
 ; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB49_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB49_4:
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB49_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB49_6:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_d1_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1-NEXT:    movl %eax, %esi
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    shldl %cl, %eax, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    jne .LBB49_2
+; X86-BMI1-NEXT:    je .LBB49_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB49_2:
 ; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB49_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ebx
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB49_4:
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB49_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB49_6:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_d1_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %eax, %esi
-; X86-BMI2-NEXT:    shlxl %ecx, %eax, %edi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB49_2
-; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB49_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB49_2:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB49_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB49_4:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB49_6
 ; X86-BMI2-NEXT:  # %bb.5:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB49_6:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_d1_indexzext:
@@ -3767,115 +3702,111 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_d2_load:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edx
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movl 4(%eax), %edi
 ; X86-NOBMI-NEXT:    movl %edx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
+; X86-NOBMI-NEXT:    shldl %cl, %edx, %edi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    jne .LBB50_2
+; X86-NOBMI-NEXT:    je .LBB50_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB50_2:
 ; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB50_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB50_4:
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB50_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB50_6:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_d2_load:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edx
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl (%eax), %edx
+; X86-BMI1-NEXT:    movl 4(%eax), %edi
 ; X86-BMI1-NEXT:    movl %edx, %esi
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    shldl %cl, %edx, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    jne .LBB50_2
+; X86-BMI1-NEXT:    je .LBB50_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB50_2:
 ; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB50_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ebx
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB50_4:
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB50_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB50_6:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_d2_load:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %edx
-; X86-BMI2-NEXT:    movl 4(%eax), %esi
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %edx, %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl (%eax), %edx
+; X86-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-BMI2-NEXT:    shldl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB50_2
-; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB50_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB50_2:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB50_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB50_4:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB50_6
 ; X86-BMI2-NEXT:  # %bb.5:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB50_6:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_d2_load:
@@ -3908,115 +3839,111 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_d3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %edx
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edx
+; X86-NOBMI-NEXT:    movl 4(%eax), %edi
 ; X86-NOBMI-NEXT:    movl %edx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    shldl %cl, %edx, %eax
+; X86-NOBMI-NEXT:    shldl %cl, %edx, %edi
+; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:    jne .LBB51_2
+; X86-NOBMI-NEXT:    je .LBB51_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:  .LBB51_2:
 ; X86-NOBMI-NEXT:    movl %edi, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:    jne .LBB51_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:  .LBB51_4:
-; X86-NOBMI-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    jne .LBB51_6
 ; X86-NOBMI-NEXT:  # %bb.5:
-; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    movl %esi, %eax
 ; X86-NOBMI-NEXT:  .LBB51_6:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
-; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1-LABEL: bzhi64_d3_load_indexzext:
 ; X86-BMI1:       # %bb.0:
-; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl (%eax), %edx
-; X86-BMI1-NEXT:    movl 4(%eax), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1-NEXT:    movl (%eax), %edx
+; X86-BMI1-NEXT:    movl 4(%eax), %edi
 ; X86-BMI1-NEXT:    movl %edx, %esi
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    shldl %cl, %edx, %eax
+; X86-BMI1-NEXT:    shldl %cl, %edx, %edi
+; X86-BMI1-NEXT:    xorl %edx, %edx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl %esi, %edi
-; X86-BMI1-NEXT:    jne .LBB51_2
+; X86-BMI1-NEXT:    je .LBB51_2
 ; X86-BMI1-NEXT:  # %bb.1:
-; X86-BMI1-NEXT:    movl %eax, %edi
+; X86-BMI1-NEXT:    movl %esi, %edi
+; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB51_2:
 ; X86-BMI1-NEXT:    movl %edi, %eax
 ; X86-BMI1-NEXT:    shrl %cl, %eax
-; X86-BMI1-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1-NEXT:    testb $32, %cl
-; X86-BMI1-NEXT:    movl $0, %edx
 ; X86-BMI1-NEXT:    jne .LBB51_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %esi, %ebx
 ; X86-BMI1-NEXT:    movl %eax, %edx
 ; X86-BMI1-NEXT:  .LBB51_4:
-; X86-BMI1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1-NEXT:    testb $32, %cl
 ; X86-BMI1-NEXT:    jne .LBB51_6
 ; X86-BMI1-NEXT:  # %bb.5:
-; X86-BMI1-NEXT:    movl %ebx, %eax
+; X86-BMI1-NEXT:    movl %esi, %eax
 ; X86-BMI1-NEXT:  .LBB51_6:
 ; X86-BMI1-NEXT:    popl %esi
 ; X86-BMI1-NEXT:    popl %edi
-; X86-BMI1-NEXT:    popl %ebx
 ; X86-BMI1-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: bzhi64_d3_load_indexzext:
 ; X86-BMI2:       # %bb.0:
+; X86-BMI2-NEXT:    pushl %ebx
 ; X86-BMI2-NEXT:    pushl %edi
 ; X86-BMI2-NEXT:    pushl %esi
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl (%eax), %edx
-; X86-BMI2-NEXT:    movl 4(%eax), %esi
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI2-NEXT:    shldl %cl, %edx, %esi
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl (%eax), %edx
+; X86-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-BMI2-NEXT:    shldl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shlxl %ecx, %edx, %edi
 ; X86-BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
-; X86-BMI2-NEXT:    je .LBB51_2
-; X86-BMI2-NEXT:  # %bb.1:
 ; X86-BMI2-NEXT:    movl %edi, %esi
-; X86-BMI2-NEXT:    movl $0, %edi
+; X86-BMI2-NEXT:    jne .LBB51_2
+; X86-BMI2-NEXT:  # %bb.1:
+; X86-BMI2-NEXT:    movl %eax, %esi
 ; X86-BMI2-NEXT:  .LBB51_2:
 ; X86-BMI2-NEXT:    shrxl %ecx, %esi, %eax
+; X86-BMI2-NEXT:    movl $0, %ebx
 ; X86-BMI2-NEXT:    jne .LBB51_4
 ; X86-BMI2-NEXT:  # %bb.3:
 ; X86-BMI2-NEXT:    movl %eax, %edx
+; X86-BMI2-NEXT:    movl %edi, %ebx
 ; X86-BMI2-NEXT:  .LBB51_4:
-; X86-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI2-NEXT:    testb $32, %cl
 ; X86-BMI2-NEXT:    jne .LBB51_6
 ; X86-BMI2-NEXT:  # %bb.5:
-; X86-BMI2-NEXT:    movl %edi, %eax
+; X86-BMI2-NEXT:    movl %ebx, %eax
 ; X86-BMI2-NEXT:  .LBB51_6:
 ; X86-BMI2-NEXT:    popl %esi
 ; X86-BMI2-NEXT:    popl %edi
+; X86-BMI2-NEXT:    popl %ebx
 ; X86-BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_d3_load_indexzext:
@@ -4056,10 +3983,10 @@ define i32 @bzhi64_32_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_32_d0:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl %esi, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %edx
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %eax
@@ -4082,10 +4009,10 @@ define i32 @bzhi64_32_d0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-BMI1-LABEL: bzhi64_32_d0:
 ; X86-BMI1:       # %bb.0:
 ; X86-BMI1-NEXT:    pushl %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movb $64, %cl
 ; X86-BMI1-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1-NEXT:    movl %esi, %edx
 ; X86-BMI1-NEXT:    shll %cl, %edx
 ; X86-BMI1-NEXT:    shldl %cl, %esi, %eax
@@ -4107,10 +4034,10 @@ define i32 @bzhi64_32_d0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X86-BMI2-LABEL: bzhi64_32_d0:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $64, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -4161,9 +4088,9 @@ define i32 @bzhi64_32_d0(i64 %val, i64 %numlowbits) nounwind {
 define i32 @bzhi64_32_d1(i64 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_32_d1:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
@@ -4323,9 +4250,9 @@ define i32 @bzhi32_constant_mask8_load(ptr %val) nounwind {
 define i64 @bzhi64_constant_mask64(i64 %val) nounwind {
 ; X86-LABEL: bzhi64_constant_mask64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_constant_mask64:
@@ -4362,10 +4289,10 @@ define i64 @bzhi64_constant_mask64(i64 %val) nounwind {
 define i64 @bzhi64_constant_mask64_load(ptr %val) nounwind {
 ; X86-LABEL: bzhi64_constant_mask64_load:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1073741823, %edx # imm = 0x3FFFFFFF
-; X86-NEXT:    andl 4(%ecx), %edx
+; X86-NEXT:    andl 4(%eax), %edx
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_constant_mask64_load:
@@ -4512,26 +4439,24 @@ define i64 @bzhi64_constant_mask8_load(ptr %val) nounwind {
 define void @PR111323(ptr nocapture noundef writeonly %use, i64 noundef %x, i64 noundef %y) nounwind {
 ; X86-LABEL: PR111323:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    je .LBB68_2
 ; X86-NEXT:  # %bb.1: # %if.end
-; X86-NEXT:    andl $65535, %eax # imm = 0xFFFF
 ; X86-NEXT:    andl $-1, %edx
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    andl $65535, %ecx # imm = 0xFFFF
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:  .LBB68_2: # %return
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: PR111323:
diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll
index 255ea44e520c0..ef60d67a48f43 100644
--- a/llvm/test/CodeGen/X86/extract-store.ll
+++ b/llvm/test/CodeGen/X86/extract-store.ll
@@ -11,9 +11,9 @@
 define void @extract_i8_0(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i8_0:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-X86-NEXT:    movd %xmm0, %ecx
-; SSE2-X86-NEXT:    movb %cl, (%eax)
+; SSE2-X86-NEXT:    movd %xmm0, %eax
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE2-X86-NEXT:    movb %al, (%ecx)
 ; SSE2-X86-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_0:
@@ -51,10 +51,10 @@ define void @extract_i8_0(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 define void @extract_i8_3(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i8_3:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-X86-NEXT:    movd %xmm0, %ecx
-; SSE2-X86-NEXT:    shrl $24, %ecx
-; SSE2-X86-NEXT:    movb %cl, (%eax)
+; SSE2-X86-NEXT:    movd %xmm0, %eax
+; SSE2-X86-NEXT:    shrl $24, %eax
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE2-X86-NEXT:    movb %al, (%ecx)
 ; SSE2-X86-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_3:
@@ -93,9 +93,9 @@ define void @extract_i8_3(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 define void @extract_i8_15(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i8_15:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-X86-NEXT:    pextrw $7, %xmm0, %ecx
-; SSE2-X86-NEXT:    movb %ch, (%eax)
+; SSE2-X86-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE2-X86-NEXT:    movb %ah, (%ecx)
 ; SSE2-X86-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i8_15:
@@ -133,9 +133,9 @@ define void @extract_i8_15(ptr nocapture %dst, <16 x i8> %foo) nounwind {
 define void @extract_i16_0(ptr nocapture %dst, <8 x i16> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i16_0:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-X86-NEXT:    movd %xmm0, %ecx
-; SSE2-X86-NEXT:    movw %cx, (%eax)
+; SSE2-X86-NEXT:    movd %xmm0, %eax
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE2-X86-NEXT:    movw %ax, (%ecx)
 ; SSE2-X86-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i16_0:
@@ -173,9 +173,9 @@ define void @extract_i16_0(ptr nocapture %dst, <8 x i16> %foo) nounwind {
 define void @extract_i16_7(ptr nocapture %dst, <8 x i16> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i16_7:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-X86-NEXT:    pextrw $7, %xmm0, %ecx
-; SSE2-X86-NEXT:    movw %cx, (%eax)
+; SSE2-X86-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE2-X86-NEXT:    movw %ax, (%ecx)
 ; SSE2-X86-NEXT:    retl
 ;
 ; SSE2-X64-LABEL: extract_i16_7:
@@ -240,8 +240,8 @@ define void @extract_i32_0(ptr nocapture %dst, <4 x i32> %foo) nounwind {
 define void @extract_i32_3(ptr nocapture %dst, <4 x i32> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_i32_3:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-X86-NEXT:    movd %xmm0, (%eax)
 ; SSE2-X86-NEXT:    retl
 ;
@@ -307,8 +307,8 @@ define void @extract_i64_0(ptr nocapture %dst, <2 x i64> %foo) nounwind {
 define void @extract_i64_1(ptr nocapture %dst, <2 x i64> %foo) nounwind {
 ; SSE-X86-LABEL: extract_i64_1:
 ; SSE-X86:       # %bb.0:
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    movq %xmm0, (%eax)
 ; SSE-X86-NEXT:    retl
 ;
@@ -325,8 +325,8 @@ define void @extract_i64_1(ptr nocapture %dst, <2 x i64> %foo) nounwind {
 ;
 ; AVX-X86-LABEL: extract_i64_1:
 ; AVX-X86:       # %bb.0:
-; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-X86-NEXT:    vmovlps %xmm0, (%eax)
 ; AVX-X86-NEXT:    retl
 ;
@@ -369,8 +369,8 @@ define void @extract_f32_0(ptr nocapture %dst, <4 x float> %foo) nounwind {
 define void @extract_f32_3(ptr nocapture %dst, <4 x float> %foo) nounwind {
 ; SSE2-X86-LABEL: extract_f32_3:
 ; SSE2-X86:       # %bb.0:
-; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-X86-NEXT:    movss %xmm0, (%eax)
 ; SSE2-X86-NEXT:    retl
 ;
@@ -463,19 +463,15 @@ define void @extract_f64_1(ptr nocapture %dst, <2 x double> %foo) nounwind {
 define void @extract_f128_0(ptr nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE-X86-LABEL: extract_f128_0:
 ; SSE-X86:       # %bb.0:
-; SSE-X86-NEXT:    pushl %edi
-; SSE-X86-NEXT:    pushl %esi
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SSE-X86-NEXT:    movl %esi, 12(%edi)
-; SSE-X86-NEXT:    movl %edx, 8(%edi)
-; SSE-X86-NEXT:    movl %ecx, 4(%edi)
-; SSE-X86-NEXT:    movl %eax, (%edi)
-; SSE-X86-NEXT:    popl %esi
-; SSE-X86-NEXT:    popl %edi
+; SSE-X86-NEXT:    movl %ecx, 12(%edx)
+; SSE-X86-NEXT:    movl %eax, 8(%edx)
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE-X86-NEXT:    movl %ecx, 4(%edx)
+; SSE-X86-NEXT:    movl %eax, (%edx)
 ; SSE-X86-NEXT:    retl
 ;
 ; SSE-X64-LABEL: extract_f128_0:
@@ -502,19 +498,15 @@ define void @extract_f128_0(ptr nocapture %dst, <2 x fp128> %foo) nounwind {
 define void @extract_f128_1(ptr nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE-X86-LABEL: extract_f128_1:
 ; SSE-X86:       # %bb.0:
-; SSE-X86-NEXT:    pushl %edi
-; SSE-X86-NEXT:    pushl %esi
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SSE-X86-NEXT:    movl %esi, 12(%edi)
-; SSE-X86-NEXT:    movl %edx, 8(%edi)
-; SSE-X86-NEXT:    movl %ecx, 4(%edi)
-; SSE-X86-NEXT:    movl %eax, (%edi)
-; SSE-X86-NEXT:    popl %esi
-; SSE-X86-NEXT:    popl %edi
+; SSE-X86-NEXT:    movl %ecx, 12(%edx)
+; SSE-X86-NEXT:    movl %eax, 8(%edx)
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SSE-X86-NEXT:    movl %ecx, 4(%edx)
+; SSE-X86-NEXT:    movl %eax, (%edx)
 ; SSE-X86-NEXT:    retl
 ;
 ; SSE-X64-LABEL: extract_f128_1:
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 9ce1fd6f20976..b168e4485a561 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -353,9 +353,9 @@ define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z,
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovaps 8(%ebp), %xmm3
 ; X86-NEXT:    vcmpneq_oqss %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vmovaps 8(%ebp), %xmm1
+; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 0e0cfc64af9ee..9c9b09df7088e 100644
--- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -14,19 +14,19 @@ define void @test_extractelement_legalization_storereuse(<4 x i32> %a, ptr nocap
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    paddd (%edx), %xmm0
-; CHECK-NEXT:    movdqa %xmm0, (%edx)
-; CHECK-NEXT:    movl (%edx), %esi
-; CHECK-NEXT:    movl 4(%edx), %edi
-; CHECK-NEXT:    shll $4, %ecx
-; CHECK-NEXT:    movl 8(%edx), %ebx
-; CHECK-NEXT:    movl 12(%edx), %edx
-; CHECK-NEXT:    movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT:    movl %edi, (%eax,%ecx)
-; CHECK-NEXT:    movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT:    movl %edx, 4(%eax,%ecx)
+; CHECK-NEXT:    paddd (%eax), %xmm0
+; CHECK-NEXT:    movdqa %xmm0, (%eax)
+; CHECK-NEXT:    movl 12(%eax), %ecx
+; CHECK-NEXT:    movl 8(%eax), %edx
+; CHECK-NEXT:    movl (%eax), %esi
+; CHECK-NEXT:    movl 4(%eax), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    shll $4, %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl %esi, 12(%ebx,%edi)
+; CHECK-NEXT:    movl %eax, (%ebx,%edi)
+; CHECK-NEXT:    movl %edx, 8(%ebx,%edi)
+; CHECK-NEXT:    movl %ecx, 4(%ebx,%edi)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index ce68eebd5b752..80b429d673b21 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -97,8 +97,8 @@ define void @t5(ptr%a0, ptr%a1) {
 ; X86-SSE2-LABEL: t5:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE2-NEXT:    movaps (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    retl
 ;
@@ -178,11 +178,11 @@ define void @PR43971(ptr%a0, ptr%a1) {
 ; X86-SSE2-LABEL: PR43971:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movaps 16(%ecx), %xmm0
+; X86-SSE2-NEXT:    movaps 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE2-NEXT:    cmpltss %xmm0, %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X86-SSE2-NEXT:    andps %xmm1, %xmm2
 ; X86-SSE2-NEXT:    andnps %xmm0, %xmm1
@@ -310,10 +310,10 @@ define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture
 ; X86-SSE2-LABEL: subextract_broadcast_load_constant:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl $-1583308898, (%edx) # imm = 0xA1A09F9E
-; X86-SSE2-NEXT:    movw $-24674, (%ecx) # imm = 0x9F9E
+; X86-SSE2-NEXT:    movl $-1583308898, (%eax) # imm = 0xA1A09F9E
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movw $-24674, (%eax) # imm = 0x9F9E
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movw $-24160, (%eax) # imm = 0xA1A0
 ; X86-SSE2-NEXT:    retl
 ;
@@ -344,10 +344,10 @@ define i32 @multi_use_load_scalarization(ptr %p) nounwind {
 ; X86-SSE2-LABEL: multi_use_load_scalarization:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl (%ecx), %eax
 ; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X86-SSE2-NEXT:    psubd %xmm1, %xmm0
+; X86-SSE2-NEXT:    movl (%ecx), %eax
 ; X86-SSE2-NEXT:    movdqa %xmm0, (%ecx)
 ; X86-SSE2-NEXT:    retl
 ;
@@ -428,15 +428,15 @@ define i32 @main() nounwind {
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-32, %esp
 ; X86-SSE2-NEXT:    subl $64, %esp
-; X86-SSE2-NEXT:    movaps n1+16, %xmm0
-; X86-SSE2-NEXT:    movaps n1, %xmm1
-; X86-SSE2-NEXT:    movl zero+4, %ecx
-; X86-SSE2-NEXT:    movl zero+8, %eax
-; X86-SSE2-NEXT:    movaps %xmm1, zero
-; X86-SSE2-NEXT:    movaps %xmm0, zero+16
 ; X86-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2]
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl zero+4, %ecx
+; X86-SSE2-NEXT:    movl zero+8, %eax
+; X86-SSE2-NEXT:    movaps n1, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, zero
+; X86-SSE2-NEXT:    movaps n1+16, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, zero+16
 ; X86-SSE2-NEXT:    movdqa (%esp), %xmm0
 ; X86-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -538,10 +538,10 @@ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE2-NEXT:    movups 24(%esi), %xmm0
 ; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps 32(%esi), %xmm0
 ; X86-SSE2-NEXT:    calll ccosf at PLT
diff --git a/llvm/test/CodeGen/X86/fildll.ll b/llvm/test/CodeGen/X86/fildll.ll
index 43c5525f6068c..cadccdfecea9a 100644
--- a/llvm/test/CodeGen/X86/fildll.ll
+++ b/llvm/test/CodeGen/X86/fildll.ll
@@ -32,8 +32,8 @@ define fastcc double @uint64_to_fp(i64 %X) {
 ; CHECK-NEXT:    .cfi_def_cfa_register %ebp
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $16, %esp
-; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, (%esp)
+; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    shrl $31, %edx
 ; CHECK-NEXT:    fildll (%esp)
 ; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%edx,4)
diff --git a/llvm/test/CodeGen/X86/fixup-bw-copy.ll b/llvm/test/CodeGen/X86/fixup-bw-copy.ll
index 2af90469f4cce..6f4dfb4a3d670 100644
--- a/llvm/test/CodeGen/X86/fixup-bw-copy.ll
+++ b/llvm/test/CodeGen/X86/fixup-bw-copy.ll
@@ -86,8 +86,8 @@ define i8 @test_movb_hreg(i16 %a0) {
 ; X86-LABEL: test_movb_hreg:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %ah
-; X86-NEXT:    movb %ah, %al
+; X86-NEXT:    addb %ah, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
   %tmp0 = trunc i16 %a0 to i8
   %tmp1 = lshr i16 %a0, 8
diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll
index 548e74de39295..0fcfb560a89b4 100644
--- a/llvm/test/CodeGen/X86/fixup-lea.ll
+++ b/llvm/test/CodeGen/X86/fixup-lea.ll
@@ -12,6 +12,7 @@ define void @foo(i32 inreg %dns) minsize {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -39,6 +40,7 @@ define void @bar(i32 inreg %dns) minsize {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB1_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -65,6 +67,7 @@ define void @foo_optsize(i32 inreg %dns) optsize {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB2_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -92,6 +95,7 @@ define void @bar_optsize(i32 inreg %dns) optsize {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB3_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -118,6 +122,7 @@ define void @foo_pgso(i32 inreg %dns) !prof !14 {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB4_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -145,6 +150,7 @@ define void @bar_pgso(i32 inreg %dns) !prof !14 {
 ; CHECK-NEXT:    movzwl %cx, %edx
 ; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    cmpl %eax, %edx
+; CHECK-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; CHECK-NEXT:    jl .LBB5_1
 ; CHECK-NEXT:  # %bb.2: # %for.end
 ; CHECK-NEXT:    retl
@@ -171,6 +177,7 @@ define void @foo_nosize(i32 inreg %dns) {
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    decl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; SLOW-NEXT:    jl .LBB6_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
@@ -184,6 +191,7 @@ define void @foo_nosize(i32 inreg %dns) {
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $-1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; FAST-NEXT:    jl .LBB6_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
@@ -211,6 +219,7 @@ define void @bar_nosize(i32 inreg %dns) {
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    incl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; SLOW-NEXT:    jl .LBB7_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
@@ -224,6 +233,7 @@ define void @bar_nosize(i32 inreg %dns) {
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    # kill: def $cx killed $cx killed $ecx def $ecx
 ; FAST-NEXT:    jl .LBB7_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/flt-rounds.ll b/llvm/test/CodeGen/X86/flt-rounds.ll
index 1d7a8d8456c27..6575b77fec56c 100644
--- a/llvm/test/CodeGen/X86/flt-rounds.ll
+++ b/llvm/test/CodeGen/X86/flt-rounds.ll
@@ -42,50 +42,52 @@ define i32 @multiple_flt_rounds() nounwind {
 ; SDAG-X86-LABEL: multiple_flt_rounds:
 ; SDAG-X86:       # %bb.0: # %entry
 ; SDAG-X86-NEXT:    pushl %ebx
+; SDAG-X86-NEXT:    pushl %edi
 ; SDAG-X86-NEXT:    pushl %esi
-; SDAG-X86-NEXT:    subl $20, %esp
+; SDAG-X86-NEXT:    subl $16, %esp
 ; SDAG-X86-NEXT:    movl $1024, (%esp) # imm = 0x400
 ; SDAG-X86-NEXT:    calll fesetround
 ; SDAG-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; SDAG-X86-NEXT:    shrl $9, %ecx
-; SDAG-X86-NEXT:    andb $6, %cl
-; SDAG-X86-NEXT:    movl $45, %esi
-; SDAG-X86-NEXT:    movl $45, %eax
-; SDAG-X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SDAG-X86-NEXT:    shrl %cl, %eax
-; SDAG-X86-NEXT:    andl $3, %eax
-; SDAG-X86-NEXT:    xorl %ebx, %ebx
-; SDAG-X86-NEXT:    cmpl $3, %eax
-; SDAG-X86-NEXT:    setne %bl
+; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ebx
 ; SDAG-X86-NEXT:    movl $0, (%esp)
 ; SDAG-X86-NEXT:    calll fesetround
 ; SDAG-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edi
+; SDAG-X86-NEXT:    movl $3072, (%esp) # imm = 0xC00
+; SDAG-X86-NEXT:    calll fesetround
+; SDAG-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl $2048, (%esp) # imm = 0x800
+; SDAG-X86-NEXT:    shrl $9, %ebx
+; SDAG-X86-NEXT:    andb $6, %bl
+; SDAG-X86-NEXT:    movl $45, %esi
+; SDAG-X86-NEXT:    movl $45, %edx
+; SDAG-X86-NEXT:    movl %ebx, %ecx
+; SDAG-X86-NEXT:    shrl %cl, %edx
+; SDAG-X86-NEXT:    andl $3, %edx
+; SDAG-X86-NEXT:    xorl %ebx, %ebx
+; SDAG-X86-NEXT:    cmpl $3, %edx
+; SDAG-X86-NEXT:    setne %bl
+; SDAG-X86-NEXT:    movl %edi, %ecx
 ; SDAG-X86-NEXT:    shrl $9, %ecx
 ; SDAG-X86-NEXT:    andb $6, %cl
-; SDAG-X86-NEXT:    movl $45, %eax
+; SDAG-X86-NEXT:    movl $45, %edx
 ; SDAG-X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SDAG-X86-NEXT:    shrl %cl, %eax
-; SDAG-X86-NEXT:    andl $3, %eax
-; SDAG-X86-NEXT:    cmpl $1, %eax
+; SDAG-X86-NEXT:    shrl %cl, %edx
+; SDAG-X86-NEXT:    andl $3, %edx
+; SDAG-X86-NEXT:    cmpl $1, %edx
 ; SDAG-X86-NEXT:    je .LBB1_2
 ; SDAG-X86-NEXT:  # %bb.1: # %entry
 ; SDAG-X86-NEXT:    incl %ebx
 ; SDAG-X86-NEXT:  .LBB1_2: # %entry
-; SDAG-X86-NEXT:    movl $3072, (%esp) # imm = 0xC00
-; SDAG-X86-NEXT:    calll fesetround
-; SDAG-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; SDAG-X86-NEXT:    shrl $9, %ecx
-; SDAG-X86-NEXT:    andb $6, %cl
-; SDAG-X86-NEXT:    movl $45, %eax
-; SDAG-X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SDAG-X86-NEXT:    shrl %cl, %eax
-; SDAG-X86-NEXT:    andl $3, %eax
-; SDAG-X86-NEXT:    cmpl $1, %eax
+; SDAG-X86-NEXT:    shrl $9, %eax
+; SDAG-X86-NEXT:    andb $6, %al
+; SDAG-X86-NEXT:    movl $45, %edx
+; SDAG-X86-NEXT:    movl %eax, %ecx
+; SDAG-X86-NEXT:    shrl %cl, %edx
+; SDAG-X86-NEXT:    andl $3, %edx
+; SDAG-X86-NEXT:    cmpl $1, %edx
 ; SDAG-X86-NEXT:    sbbl $-1, %ebx
-; SDAG-X86-NEXT:    movl $2048, (%esp) # imm = 0x800
 ; SDAG-X86-NEXT:    calll fesetround
 ; SDAG-X86-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -101,8 +103,9 @@ define i32 @multiple_flt_rounds() nounwind {
 ; SDAG-X86-NEXT:    xorl %eax, %eax
 ; SDAG-X86-NEXT:    cmpl %ecx, %ebx
 ; SDAG-X86-NEXT:    setne %al
-; SDAG-X86-NEXT:    addl $20, %esp
+; SDAG-X86-NEXT:    addl $16, %esp
 ; SDAG-X86-NEXT:    popl %esi
+; SDAG-X86-NEXT:    popl %edi
 ; SDAG-X86-NEXT:    popl %ebx
 ; SDAG-X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
index 6d865628ba715..8d2d87585a55b 100644
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -163,11 +163,11 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
 ; FMA32-LABEL: test_f80:
 ; FMA32:       ## %bb.0: ## %entry
 ; FMA32-NEXT:    subl $60, %esp ## encoding: [0x83,0xec,0x3c]
-; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
-; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
 ; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x60]
 ; FMA32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
 ; FMA32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
 ; FMA32-NEXT:    fstpt (%esp) ## encoding: [0xdb,0x3c,0x24]
 ; FMA32-NEXT:    calll _fmal ## encoding: [0xe8,A,A,A,A]
 ; FMA32-NEXT:    ## fixup A - offset: 1, value: _fmal, kind: FK_PCRel_4
@@ -177,11 +177,11 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
 ; FMACALL32-LABEL: test_f80:
 ; FMACALL32:       ## %bb.0: ## %entry
 ; FMACALL32-NEXT:    subl $60, %esp ## encoding: [0x83,0xec,0x3c]
-; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
-; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
 ; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x60]
 ; FMACALL32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
 ; FMACALL32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
 ; FMACALL32-NEXT:    fstpt (%esp) ## encoding: [0xdb,0x3c,0x24]
 ; FMACALL32-NEXT:    calll _fmal ## encoding: [0xe8,A,A,A,A]
 ; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmal, kind: FK_PCRel_4
diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll
index 16ebf70126f8b..812940cfc306b 100644
--- a/llvm/test/CodeGen/X86/fmf-flags.ll
+++ b/llvm/test/CodeGen/X86/fmf-flags.ll
@@ -62,10 +62,8 @@ define dso_local double @not_so_fast_mul_add(double %x) {
 ; X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fmull {{\.?LCPI[0-9]+_[0-9]+}}
-; X86-NEXT:    fxch %st(1)
-; X86-NEXT:    fmull {{\.?LCPI[0-9]+_[0-9]+}}
-; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:    fstpl mul1
+; X86-NEXT:    fmull {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-NEXT:    retl
   %m = fmul double %x, 4.2
   %a = fadd fast double %m, %x
@@ -94,10 +92,9 @@ define dso_local float @not_so_fast_recip_sqrt(float %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fsqrt
+; X86-NEXT:    fsts sqrt1
 ; X86-NEXT:    fld1
-; X86-NEXT:    fdiv %st(1), %st
-; X86-NEXT:    fxch %st(1)
-; X86-NEXT:    fstps sqrt1
+; X86-NEXT:    fdivp %st, %st(1)
 ; X86-NEXT:    retl
   %y = call float @llvm.sqrt.f32(float %x)
   %z = fdiv fast float 1.0, %y
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index a56758d73165e..7d468aac9aba7 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -236,11 +236,11 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vsubss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vmaxss %xmm0, %xmm2, %xmm0
-; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm1
-; X86-NEXT:    vandps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vsubss %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -292,11 +292,11 @@ define double @test_fmaximum_zero0(double %x, double %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -349,11 +349,11 @@ define double @test_fmaximum_zero1(double %x, double %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
-; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovlpd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -2148,22 +2148,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ;
 ; X86-LABEL: test_fmaximum_v4f16:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $164, %esp
-; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2173,7 +2162,8 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovshdup {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = mem[1,1,3,3]
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
@@ -2205,17 +2195,21 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
@@ -2241,17 +2235,17 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovd %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm1, (%esp)
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; X86-NEXT:    addl $164, %esp
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vinsertps $28, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
+; X86-NEXT:    addl $112, %esp
 ; X86-NEXT:    retl
   %r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %x, <4 x half> %y)
   ret <4 x half> %r
@@ -2841,47 +2835,31 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ;
 ; X86-LABEL: test_fmaximum_v4bf16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $68, %esp
+; X86-NEXT:    subl $84, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 88
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %esi
-; X86-NEXT:    vpsrlq $48, %xmm1, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %edi
-; X86-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-NEXT:    vpextrw $0, %xmm2, %ebx
-; X86-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-NEXT:    vpextrw $0, %xmm2, %ebp
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    vmovdqa %xmm1, %xmm5
+; X86-NEXT:    vmovdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm2
 ; X86-NEXT:    vpextrw $0, %xmm0, %eax
-; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-NEXT:    vpsrld $16, %xmm1, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm0, %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
-; X86-NEXT:    vpextrw $0, %xmm1, %ecx
-; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vmovdqa %xmm0, %xmm4
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
+; X86-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
-; X86-NEXT:    vorps %xmm3, %xmm0, %xmm2
-; X86-NEXT:    vandps %xmm1, %xmm2, %xmm1
-; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
-; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vorps %xmm3, %xmm1, %xmm2
+; X86-NEXT:    vandps %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
@@ -2894,10 +2872,16 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %ebp
-; X86-NEXT:    vmovd %ebp, %xmm0
-; X86-NEXT:    shll $16, %ebx
-; X86-NEXT:    vmovd %ebx, %xmm1
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm1
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm2
 ; X86-NEXT:    vandps %xmm0, %xmm2, %xmm0
@@ -2908,10 +2892,16 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %edi
-; X86-NEXT:    vmovd %edi, %xmm0
-; X86-NEXT:    shll $16, %esi
-; X86-NEXT:    vmovd %esi, %xmm1
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm1
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm2
 ; X86-NEXT:    vandps %xmm0, %xmm2, %xmm0
@@ -2919,25 +2909,17 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
 ; X86-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfbf2
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovd %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm1, (%esp)
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; X86-NEXT:    addl $68, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vinsertps $28, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
+; X86-NEXT:    addl $84, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %r = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 8abd68701676c..90eef6a4957ae 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -235,11 +235,11 @@ define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vsubss %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vmaxss %xmm0, %xmm2, %xmm0
-; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm1
-; X86-NEXT:    vandps %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vsubss %xmm0, %xmm1, %xmm2
+; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -276,9 +276,9 @@ define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -313,9 +313,9 @@ define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -2051,22 +2051,11 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ;
 ; X86-LABEL: test_fmaximumnum_v4f16:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $164, %esp
-; X86-NEXT:    vmovdqa %xmm0, %xmm2
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
+; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -2076,7 +2065,8 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vmovshdup {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = mem[1,1,3,3]
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
@@ -2110,17 +2100,21 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
@@ -2148,17 +2142,17 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
 ; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovd %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm1, (%esp)
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; X86-NEXT:    addl $164, %esp
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vinsertps $28, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
+; X86-NEXT:    addl $112, %esp
 ; X86-NEXT:    retl
   %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
   ret <4 x half> %r
@@ -2845,42 +2839,34 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ;
 ; X86-LABEL: test_fmaximumnum_v4bf16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $68, %esp
-; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %esi
-; X86-NEXT:    vpsrlq $48, %xmm1, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %edi
-; X86-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-NEXT:    vpextrw $0, %xmm2, %ebx
-; X86-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-NEXT:    vpextrw $0, %xmm2, %ebp
+; X86-NEXT:    subl $84, %esp
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    vmovdqa %xmm1, %xmm7
+; X86-NEXT:    vmovdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm4
 ; X86-NEXT:    vpextrw $0, %xmm0, %eax
-; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
-; X86-NEXT:    vpsrld $16, %xmm1, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm2, %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm2
-; X86-NEXT:    vpextrw $0, %xmm0, %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
-; X86-NEXT:    vpextrw $0, %xmm1, %ecx
-; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT:    vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
-; X86-NEXT:    vorps %xmm4, %xmm0, %xmm3
-; X86-NEXT:    vandps %xmm1, %xmm3, %xmm1
-; X86-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovdqa %xmm0, %xmm6
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
+; X86-NEXT:    vmaxss %xmm4, %xmm1, %xmm2
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm5 = [NaN,NaN,NaN,NaN]
+; X86-NEXT:    vorps %xmm5, %xmm1, %xmm3
+; X86-NEXT:    vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT:    vcmpunordss %xmm4, %xmm4, %xmm0
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
 ; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
-; X86-NEXT:    vorps %xmm4, %xmm1, %xmm3
+; X86-NEXT:    vorps %xmm5, %xmm1, %xmm3
 ; X86-NEXT:    vandps %xmm2, %xmm3, %xmm2
 ; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
@@ -2889,10 +2875,16 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %ebp
-; X86-NEXT:    vmovd %ebp, %xmm0
-; X86-NEXT:    shll $16, %ebx
-; X86-NEXT:    vmovd %ebx, %xmm1
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpsrlq $48, %xmm1, %xmm1
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
 ; X86-NEXT:    vandps %xmm2, %xmm3, %xmm2
@@ -2903,10 +2895,16 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    shll $16, %edi
-; X86-NEXT:    vmovd %edi, %xmm0
-; X86-NEXT:    shll $16, %esi
-; X86-NEXT:    vmovd %esi, %xmm1
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X86-NEXT:    vpextrw $0, %xmm0, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vpsrld $16, %xmm1, %xmm1
+; X86-NEXT:    vpextrw $0, %xmm1, %eax
+; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
 ; X86-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
 ; X86-NEXT:    vandps %xmm2, %xmm3, %xmm2
@@ -2914,21 +2912,17 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
 ; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __truncsfbf2
-; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vmovd %xmm0, (%esp)
-; X86-NEXT:    calll __truncsfbf2
+; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; X86-NEXT:    vmovss %xmm1, (%esp)
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    calll __truncsfbf2
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; X86-NEXT:    addl $68, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    vinsertps $28, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; X86-NEXT:    # xmm0 = xmm0[0],mem[0],zero,zero
+; X86-NEXT:    addl $84, %esp
 ; X86-NEXT:    retl
   %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
   ret <4 x bfloat> %r
@@ -3328,10 +3322,8 @@ define fp128 @test_fmaximumnum_fp128(fp128 %x, fp128 %y) nounwind {
 ; X86:       # %bb.0: # %start
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    vmovups 24(%ebp), %ymm0
 ; X86-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3339,11 +3331,10 @@ define fp128 @test_fmaximumnum_fp128(fp128 %x, fp128 %y) nounwind {
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll fmaximum_numl
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 start:
@@ -3518,10 +3509,8 @@ define fp128 @test_fminimumnum_fp128(fp128 %x, fp128 %y) nounwind {
 ; X86:       # %bb.0: # %start
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    vmovups 24(%ebp), %ymm0
 ; X86-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3529,11 +3518,10 @@ define fp128 @test_fminimumnum_fp128(fp128 %x, fp128 %y) nounwind {
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll fminimum_numl
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 start:
diff --git a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
index cbdfa32ed4627..4b0ace5490427 100644
--- a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
+++ b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll
@@ -9,74 +9,65 @@
 define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 {
 ; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f32:
 ; SOFT-FLOAT-32:       # %bb.0:
-; SOFT-FLOAT-32-NEXT:    pushl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    pushl %esi
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    popl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA-LABEL: fmuladd_intrinsic_f32:
 ; SOFT-FLOAT-32-FMA:       # %bb.0:
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_intrinsic_f32:
 ; SOFT-FLOAT-32-FMA4:       # %bb.0:
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    retl
 ;
 ; SOFT-FLOAT-64-LABEL: fmuladd_intrinsic_f32:
@@ -127,29 +118,30 @@ define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 {
 define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-LABEL: fmuladd_intrinsic_f64:
 ; SOFT-FLOAT-32:       # %bb.0:
-; SOFT-FLOAT-32-NEXT:    pushl %edi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    pushl %edi
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %eax
@@ -158,36 +150,35 @@ define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-NEXT:    popl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-NEXT:    popl %edi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA-LABEL: fmuladd_intrinsic_f64:
 ; SOFT-FLOAT-32-FMA:       # %bb.0:
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
@@ -196,36 +187,35 @@ define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA-NEXT:    popl %edi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_intrinsic_f64:
 ; SOFT-FLOAT-32-FMA4:       # %bb.0:
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
@@ -234,8 +224,6 @@ define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA4-NEXT:    popl %edi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    retl
 ;
@@ -287,74 +275,65 @@ define double @fmuladd_intrinsic_f64(double %a, double %b, double %c) #0 {
 define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 {
 ; SOFT-FLOAT-32-LABEL: fmuladd_contract_f32:
 ; SOFT-FLOAT-32:       # %bb.0:
-; SOFT-FLOAT-32-NEXT:    pushl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    pushl %esi
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    popl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_f32:
 ; SOFT-FLOAT-32-FMA:       # %bb.0:
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_f32:
 ; SOFT-FLOAT-32-FMA4:       # %bb.0:
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    retl
 ;
 ; SOFT-FLOAT-64-LABEL: fmuladd_contract_f32:
@@ -406,29 +385,30 @@ define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 {
 define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-LABEL: fmuladd_contract_f64:
 ; SOFT-FLOAT-32:       # %bb.0:
-; SOFT-FLOAT-32-NEXT:    pushl %edi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    pushl %edi
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %eax
@@ -437,36 +417,35 @@ define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-NEXT:    popl %esi
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-NEXT:    popl %edi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA-LABEL: fmuladd_contract_f64:
 ; SOFT-FLOAT-32-FMA:       # %bb.0:
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
@@ -475,36 +454,35 @@ define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA-NEXT:    popl %edi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    retl
 ;
 ; SOFT-FLOAT-32-FMA4-LABEL: fmuladd_contract_f64:
 ; SOFT-FLOAT-32-FMA4:       # %bb.0:
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 12
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -12
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %edi, -8
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -8
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
@@ -513,8 +491,6 @@ define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 {
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 8
-; SOFT-FLOAT-32-FMA4-NEXT:    popl %edi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    retl
 ;
@@ -575,85 +551,85 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-NEXT:    pushl %eax
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 24
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-NEXT:    pushl %edi
+; SOFT-FLOAT-32-NEXT:    movl %eax, %esi
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-NEXT:    movl %eax, %edi
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %edi
+; SOFT-FLOAT-32-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-NEXT:    pushl %edi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
-; SOFT-FLOAT-32-NEXT:    addl $8, %esp
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %esi, (%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    movl %eax, 12(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %ebp, 8(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %ebx, 4(%esi)
 ; SOFT-FLOAT-32-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-NEXT:    movl %eax, 12(%esi)
-; SOFT-FLOAT-32-NEXT:    movl %ebx, 8(%esi)
-; SOFT-FLOAT-32-NEXT:    movl %edi, 4(%esi)
-; SOFT-FLOAT-32-NEXT:    movl %ebp, (%esi)
+; SOFT-FLOAT-32-NEXT:    movl %eax, (%esi)
 ; SOFT-FLOAT-32-NEXT:    movl %esi, %eax
-; SOFT-FLOAT-32-NEXT:    addl $4, %esp
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-NEXT:    popl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-NEXT:    popl %edi
@@ -674,85 +650,85 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 24
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %esi
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %edi
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
-; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %esi, (%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 12(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ebp, 8(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ebx, 4(%esi)
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 12(%esi)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %ebx, 8(%esi)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edi, 4(%esi)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %ebp, (%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, (%esi)
 ; SOFT-FLOAT-32-FMA-NEXT:    movl %esi, %eax
-; SOFT-FLOAT-32-FMA-NEXT:    addl $4, %esp
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA-NEXT:    popl %edi
@@ -773,85 +749,85 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 24
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %edi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __mulsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
-; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %esi, (%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 12(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebp, 8(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebx, 4(%esi)
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __addsf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $8, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 12(%esi)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebx, 8(%esi)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edi, 4(%esi)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebp, (%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, (%esi)
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl %esi, %eax
-; SOFT-FLOAT-32-FMA4-NEXT:    addl $4, %esp
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA4-NEXT:    popl %edi
@@ -1111,69 +1087,83 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-NEXT:    subl $16, %esp
-; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 36
+; SOFT-FLOAT-32-NEXT:    subl $20, %esp
+; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 40
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; SOFT-FLOAT-32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %esi
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %edi
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    movl %eax, %esi
-; SOFT-FLOAT-32-NEXT:    movl %edx, %ebp
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
@@ -1182,58 +1172,57 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %ebx
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %edi
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; SOFT-FLOAT-32-NEXT:    movl %edx, %edi
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl %esi
+; SOFT-FLOAT-32-NEXT:    pushl %ebx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-NEXT:    movl %edx, %esi
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SOFT-FLOAT-32-NEXT:    movl %edx, 28(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %eax, 24(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %esi, 20(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %ebp, 16(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %ebx, 12(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %edi, 8(%ecx)
-; SOFT-FLOAT-32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; SOFT-FLOAT-32-NEXT:    movl %eax, 4(%ecx)
+; SOFT-FLOAT-32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-NEXT:    movl %ecx, (%esp)
+; SOFT-FLOAT-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-NEXT:    movl %edx, 28(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %eax, 24(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %ebp, 20(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %ebx, 16(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %edi, 12(%esi)
 ; SOFT-FLOAT-32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; SOFT-FLOAT-32-NEXT:    movl %eax, (%ecx)
-; SOFT-FLOAT-32-NEXT:    movl %ecx, %eax
+; SOFT-FLOAT-32-NEXT:    movl %eax, 8(%esi)
+; SOFT-FLOAT-32-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-NEXT:    addl $16, %esp
+; SOFT-FLOAT-32-NEXT:    .cfi_adjust_cfa_offset -16
+; SOFT-FLOAT-32-NEXT:    movl %edx, 4(%esi)
+; SOFT-FLOAT-32-NEXT:    movl %eax, (%esi)
+; SOFT-FLOAT-32-NEXT:    movl %esi, %eax
+; SOFT-FLOAT-32-NEXT:    addl $20, %esp
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-NEXT:    popl %esi
 ; SOFT-FLOAT-32-NEXT:    .cfi_def_cfa_offset 16
@@ -1255,69 +1244,83 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-FMA-NEXT:    subl $16, %esp
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 36
+; SOFT-FLOAT-32-FMA-NEXT:    subl $20, %esp
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 40
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %esi
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %ebp
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
@@ -1326,58 +1329,57 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %edi
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ebx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %esi
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, 28(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 24(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %esi, 20(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %ebp, 16(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %ebx, 12(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %edi, 8(%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl (%esp), %eax # 4-byte Reload
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 4(%ecx)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, (%esp)
+; SOFT-FLOAT-32-FMA-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, 28(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 24(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ebp, 20(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %ebx, 16(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edi, 12(%esi)
 ; SOFT-FLOAT-32-FMA-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, (%ecx)
-; SOFT-FLOAT-32-FMA-NEXT:    movl %ecx, %eax
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, 8(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA-NEXT:    addl $16, %esp
+; SOFT-FLOAT-32-FMA-NEXT:    .cfi_adjust_cfa_offset -16
+; SOFT-FLOAT-32-FMA-NEXT:    movl %edx, 4(%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %eax, (%esi)
+; SOFT-FLOAT-32-FMA-NEXT:    movl %esi, %eax
+; SOFT-FLOAT-32-FMA-NEXT:    addl $20, %esp
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-FMA-NEXT:    popl %esi
 ; SOFT-FLOAT-32-FMA-NEXT:    .cfi_def_cfa_offset 16
@@ -1399,69 +1401,83 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 16
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 20
-; SOFT-FLOAT-32-FMA4-NEXT:    subl $16, %esp
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 36
+; SOFT-FLOAT-32-FMA4-NEXT:    subl $20, %esp
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 40
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %esi, -20
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %edi, -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %ebx, -12
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_offset %ebp, -8
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %ebp
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __muldf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
@@ -1470,58 +1486,57 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %edi
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %ebx
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %edi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ebx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebp
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %esi
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, %ebx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, %ebp
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %ecx
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
-; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    pushl %eax
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset 4
 ; SOFT-FLOAT-32-FMA4-NEXT:    calll __adddf3
-; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
-; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, 28(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 24(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %esi, 20(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebp, 16(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebx, 12(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %edi, 8(%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl (%esp), %eax # 4-byte Reload
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 4(%ecx)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, (%esp)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, 28(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 24(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebp, 20(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %ebx, 16(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edi, 12(%esi)
 ; SOFT-FLOAT-32-FMA4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, (%ecx)
-; SOFT-FLOAT-32-FMA4-NEXT:    movl %ecx, %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, 8(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    calll __adddf3
 ; SOFT-FLOAT-32-FMA4-NEXT:    addl $16, %esp
+; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_adjust_cfa_offset -16
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %edx, 4(%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %eax, (%esi)
+; SOFT-FLOAT-32-FMA4-NEXT:    movl %esi, %eax
+; SOFT-FLOAT-32-FMA4-NEXT:    addl $20, %esp
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 20
 ; SOFT-FLOAT-32-FMA4-NEXT:    popl %esi
 ; SOFT-FLOAT-32-FMA4-NEXT:    .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/fold-and-shift.ll b/llvm/test/CodeGen/X86/fold-and-shift.ll
index 985d7c6c82f06..68b40fc80734c 100644
--- a/llvm/test/CodeGen/X86/fold-and-shift.ll
+++ b/llvm/test/CodeGen/X86/fold-and-shift.ll
@@ -6,9 +6,9 @@ define i32 @t1(ptr %X, i32 %i) {
 ; X86-LABEL: t1:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t1:
@@ -28,9 +28,9 @@ define i32 @t2(ptr %X, i32 %i) {
 ; X86-LABEL: t2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t2:
@@ -55,12 +55,12 @@ entry:
 define i32 @t3(ptr %i.ptr, ptr %arr) {
 ; X86-LABEL: t3:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shrl $11, %edx
-; X86-NEXT:    addl (%ecx,%edx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $11, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl (%edx,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t3:
@@ -85,13 +85,13 @@ entry:
 define i32 @t4(ptr %i.ptr, ptr %arr) {
 ; X86-LABEL: t4:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shrl $11, %edx
-; X86-NEXT:    addl (%ecx,%edx,4), %eax
-; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $11, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl (%edx,%ecx,4), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t4:
@@ -118,9 +118,9 @@ define i8 @t5(ptr %X, i32 %i) {
 ; X86-LABEL: t5:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $-14, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $-14, %ecx
-; X86-NEXT:    movzbl (%eax,%ecx,4), %eax
+; X86-NEXT:    movzbl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t5:
@@ -141,10 +141,10 @@ entry:
 define i8 @t6(ptr %X, i32 %i) {
 ; X86-LABEL: t6:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-255, %ecx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl $-255, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t6:
diff --git a/llvm/test/CodeGen/X86/fold-load.ll b/llvm/test/CodeGen/X86/fold-load.ll
index 580278c2a06f2..b8f966de63ada 100644
--- a/llvm/test/CodeGen/X86/fold-load.ll
+++ b/llvm/test/CodeGen/X86/fold-load.ll
@@ -44,11 +44,11 @@ cond_next:		; preds = %entry
 define i32 @test2(ptr %P, ptr %Q) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzwl (%eax), %edx
-; CHECK-NEXT:    movzbl %dl, %eax
-; CHECK-NEXT:    movw %dx, (%ecx)
+; CHECK-NEXT:    movzwl (%eax), %ecx
+; CHECK-NEXT:    movzbl %cl, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movw %cx, (%edx)
 ; CHECK-NEXT:    retl
   %A = load i16, ptr %P, align 4                      ; <i16> [#uses=11]
   %C = zext i16 %A to i32                         ; <i32> [#uses=1]
@@ -68,8 +68,8 @@ define i1 @test3(ptr %P, ptr %Q) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    xorl (%ecx), %eax
 ; CHECK-NEXT:    testl $89947, %eax # imm = 0x15F5B
 ; CHECK-NEXT:    je .LBB2_2
diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 5dcb1d63207d1..d2fa89d789d2b 100644
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -26,7 +26,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X86-NEXT:    retl
 ; X86-NEXT:  LBB0_3: ## %forbody
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $88, %esp
+; X86-NEXT:    subl $104, %esp
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    movaps {{.*#+}} xmm1 = [1.28E+2,1.28E+2,1.28E+2,1.28E+2]
 ; X86-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    cvttps2dq %xmm1, %xmm0
@@ -51,45 +55,43 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X86-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X86-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X86-NEXT:    mulps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X86-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X86-NEXT:    cmpunordps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X86-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    xorps %xmm3, %xmm3
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
 ; X86-NEXT:    calll *%esi
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X86-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X86-NEXT:    pxor %xmm1, %xmm1
-; X86-NEXT:    psubd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
 ; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X86-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    psubd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
 ; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X86-NEXT:    orps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; X86-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X86-NEXT:    xorps %xmm3, %xmm3
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X86-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 3dc083fbd673b..8abd644d61abb 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -20,97 +20,103 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    subl $24, %esp
 ; CHECK-NEXT:    .cfi_offset %esi, -20
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    movl $-1028477379, %eax # imm = 0xC2B2AE3D
-; CHECK-NEXT:    movl $668265295, %ebx # imm = 0x27D4EB4F
-; CHECK-NEXT:    movl a, %edi
-; CHECK-NEXT:    cmpl $0, (%edi)
+; CHECK-NEXT:    movl $-1028477379, %edi # imm = 0xC2B2AE3D
+; CHECK-NEXT:    movl $668265295, %esi # imm = 0x27D4EB4F
+; CHECK-NEXT:    movl a, %ebx
+; CHECK-NEXT:    cmpl $0, (%ebx)
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    movl 8(%edi), %esi
-; CHECK-NEXT:    movl 12(%edi), %edx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shldl $1, %esi, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    leal (%esi,%esi), %ecx
-; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    movl 16(%ebx), %edx
+; CHECK-NEXT:    movl 20(%ebx), %eax
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    shldl $2, %edx, %edi
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    shrdl $1, %eax, %ecx
 ; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%edi), %ecx
-; CHECK-NEXT:    movl 20(%edi), %esi
-; CHECK-NEXT:    movl %esi, %edx
-; CHECK-NEXT:    shldl $2, %ecx, %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    leal (,%ecx,4), %edx
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shrdl $1, %esi, %ecx
-; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT:    shrl %esi
-; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    movl 8(%ebx), %edi
+; CHECK-NEXT:    movl 12(%ebx), %esi
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    shldl $1, %edi, %ecx
+; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    shll $2, %edx
+; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT:    leal (%edi,%edi), %esi
+; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    adcl %eax, %ecx
 ; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    adcl %eax, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 24(%edi), %eax
+; CHECK-NEXT:    movl 24(%ebx), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl $-1028477379, %ecx # imm = 0xC2B2AE3D
 ; CHECK-NEXT:    imull %eax, %ecx
-; CHECK-NEXT:    mull %ebx
-; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    movl $668265295, %edx # imm = 0x27D4EB4F
+; CHECK-NEXT:    mull %edx
+; CHECK-NEXT:    movl %eax, %edi
 ; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:    movl 28(%edi), %edi
-; CHECK-NEXT:    imull %edi, %ebx
-; CHECK-NEXT:    addl %edx, %ebx
+; CHECK-NEXT:    movl 28(%ebx), %ebx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl $668265295, %eax # imm = 0x27D4EB4F
+; CHECK-NEXT:    imull %ebx, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    shrdl $3, %eax, %edi
 ; CHECK-NEXT:    movl $1336530590, %edx # imm = 0x4FA9D69E
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    mull %edx
-; CHECK-NEXT:    imull $-2056954758, %ecx, %ecx # imm = 0x85655C7A
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:    shrdl $3, %ebx, %esi
-; CHECK-NEXT:    sarl $3, %ebx
-; CHECK-NEXT:    orl %edx, %ebx
-; CHECK-NEXT:    orl %eax, %esi
-; CHECK-NEXT:    movl $-66860409, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    orl %edi, %ebx
+; CHECK-NEXT:    movl $-66860409, %edx # imm = 0xFC03CA87
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    mull %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    imull $326129324, %esi, %eax # imm = 0x137056AC
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT:    xorl %esi, %edi
 ; CHECK-NEXT:    movl %edi, b
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl $-66860409, %edx # imm = 0xFC03CA87
 ; CHECK-NEXT:    mull %edx
 ; CHECK-NEXT:    imull $326129324, %edi, %esi # imm = 0x137056AC
 ; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    imull $326129324, %ebx, %edx # imm = 0x137056AC
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT:    imull $-2056954758, %ecx, %ecx # imm = 0x85655C7A
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT:    imull $1336530590, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT:    # imm = 0x4FA9D69E
+; CHECK-NEXT:    addl %ecx, %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    sarl $3, %ecx
+; CHECK-NEXT:    orl %ecx, %edi
+; CHECK-NEXT:    imull $-66860409, %edi, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; CHECK-NEXT:    movl %ecx, b+4
 ; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.else
-; CHECK-NEXT:    xorl b+4, %eax
-; CHECK-NEXT:    xorl b, %ebx
+; CHECK-NEXT:    xorl b, %esi
 ; CHECK-NEXT:    movl $1419758215, %ecx # imm = 0x549FCA87
-; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    mull %ecx
-; CHECK-NEXT:    imull $93298681, %ebx, %esi # imm = 0x58F9FF9
+; CHECK-NEXT:    imull $93298681, %esi, %esi # imm = 0x58F9FF9
 ; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    xorl b+4, %edi
 ; CHECK-NEXT:    imull $1419758215, %edi, %ecx # imm = 0x549FCA87
 ; CHECK-NEXT:  .LBB0_3: # %if.end
 ; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    addl $-1028477341, %eax # imm = 0xC2B2AE63
-; CHECK-NEXT:    adcl $-2048144777, %ecx # imm = 0x85EBCA77
 ; CHECK-NEXT:    movl %eax, b
+; CHECK-NEXT:    adcl $-2048144777, %ecx # imm = 0x85EBCA77
 ; CHECK-NEXT:    movl %ecx, b+4
-; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    addl $24, %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fp-arith.ll b/llvm/test/CodeGen/X86/fp-arith.ll
index 2dd1f05ba3763..09fa5741ce28b 100644
--- a/llvm/test/CodeGen/X86/fp-arith.ll
+++ b/llvm/test/CodeGen/X86/fp-arith.ll
@@ -11,9 +11,9 @@ define x86_fp80 @fiadd_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fiadds {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -35,10 +35,10 @@ define x86_fp80 @fiadd_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fiadds {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -62,9 +62,9 @@ define x86_fp80 @fiadd_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fiaddl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -86,10 +86,10 @@ define x86_fp80 @fiadd_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fiaddl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -117,9 +117,9 @@ define x86_fp80 @fisub_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -141,10 +141,10 @@ define x86_fp80 @fisub_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -168,9 +168,9 @@ define x86_fp80 @fisub_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -192,10 +192,10 @@ define x86_fp80 @fisub_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -223,9 +223,9 @@ define x86_fp80 @fisubr_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubrs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -247,10 +247,10 @@ define x86_fp80 @fisubr_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubrs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -274,9 +274,9 @@ define x86_fp80 @fisubr_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubrl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -298,10 +298,10 @@ define x86_fp80 @fisubr_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fisubrl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -329,9 +329,9 @@ define x86_fp80 @fimul_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fimuls {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -353,10 +353,10 @@ define x86_fp80 @fimul_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fimuls {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -380,9 +380,9 @@ define x86_fp80 @fimul_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fimull (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -404,10 +404,10 @@ define x86_fp80 @fimul_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fimull (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -435,9 +435,9 @@ define x86_fp80 @fidiv_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -459,10 +459,10 @@ define x86_fp80 @fidiv_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -486,9 +486,9 @@ define x86_fp80 @fidiv_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -510,10 +510,10 @@ define x86_fp80 @fidiv_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -541,9 +541,9 @@ define x86_fp80 @fidivr_fp80_i16(x86_fp80 %a0, i16 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivrs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -565,10 +565,10 @@ define x86_fp80 @fidivr_fp80_i16_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivrs {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -592,9 +592,9 @@ define x86_fp80 @fidivr_fp80_i32(x86_fp80 %a0, i32 %a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivrl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -616,10 +616,10 @@ define x86_fp80 @fidivr_fp80_i32_ld(x86_fp80 %a0, ptr%a1) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fidivrl (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 5d69a217fb402..6d495c386a074 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -75,10 +75,10 @@ define double @f2(double %a) #0 {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    subl $12, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE-NEXT:    subsd %xmm1, %xmm0
-; X86-SSE-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE-NEXT:    xorpd %xmm0, %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    subsd %xmm0, %xmm1
+; X86-SSE-NEXT:    movsd %xmm1, (%esp)
 ; X86-SSE-NEXT:    fldl (%esp)
 ; X86-SSE-NEXT:    wait
 ; X86-SSE-NEXT:    addl $12, %esp
@@ -1300,10 +1300,10 @@ define i32 @f20u(double %x) #0 {
 ;
 ; X86-SSE-LABEL: f20u:
 ; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm2 = [2.147483648E+9,0.0E+0]
 ; X86-SSE-NEXT:    comisd %xmm0, %xmm2
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT:    ja .LBB24_2
 ; X86-SSE-NEXT:  # %bb.1: # %entry
 ; X86-SSE-NEXT:    movapd %xmm2, %xmm1
@@ -2477,9 +2477,9 @@ define double @uifdl(i64 %x) #0 {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    subl $28, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 32
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    shrl $31, %eax
 ; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -2699,9 +2699,9 @@ define float @uiffl(i64 %x) #0 {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    subl $20, %esp
 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 24
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    shrl $31, %eax
 ; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
diff --git a/llvm/test/CodeGen/X86/fp-stack-2results.ll b/llvm/test/CodeGen/X86/fp-stack-2results.ll
index ef0efa5a5a36a..c4c722de09a84 100644
--- a/llvm/test/CodeGen/X86/fp-stack-2results.ll
+++ b/llvm/test/CodeGen/X86/fp-stack-2results.ll
@@ -41,20 +41,15 @@ define %0 @test2() {
 define void @call1(ptr%P1, ptr%P2) {
 ; i686-LABEL: call1:
 ; i686:       # %bb.0:
-; i686-NEXT:    pushl %edi
-; i686-NEXT:    .cfi_def_cfa_offset 8
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 12
-; i686-NEXT:    .cfi_offset %esi, -12
-; i686-NEXT:    .cfi_offset %edi, -8
+; i686-NEXT:    .cfi_def_cfa_offset 8
+; i686-NEXT:    .cfi_offset %esi, -8
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    calll test@PLT
-; i686-NEXT:    fstpt (%edi)
 ; i686-NEXT:    fstpt (%esi)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    fstpt (%eax)
 ; i686-NEXT:    popl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 8
-; i686-NEXT:    popl %edi
 ; i686-NEXT:    .cfi_def_cfa_offset 4
 ; i686-NEXT:    retl
 ;
@@ -93,21 +88,16 @@ define void @call1(ptr%P1, ptr%P2) {
 define void @call2(ptr%P1, ptr%P2) {
 ; i686-LABEL: call2:
 ; i686:       # %bb.0:
-; i686-NEXT:    pushl %edi
-; i686-NEXT:    .cfi_def_cfa_offset 8
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 12
-; i686-NEXT:    .cfi_offset %esi, -12
-; i686-NEXT:    .cfi_offset %edi, -8
+; i686-NEXT:    .cfi_def_cfa_offset 8
+; i686-NEXT:    .cfi_offset %esi, -8
 ; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; i686-NEXT:    calll test@PLT
 ; i686-NEXT:    fxch %st(1)
-; i686-NEXT:    fstpt (%edi)
 ; i686-NEXT:    fstpt (%esi)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    fstpt (%eax)
 ; i686-NEXT:    popl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 8
-; i686-NEXT:    popl %edi
 ; i686-NEXT:    .cfi_def_cfa_offset 4
 ; i686-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll b/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll
index f4928d068bcf7..65a3cd4d97a4f 100644
--- a/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll
+++ b/llvm/test/CodeGen/X86/fp-stack-ret-conv.ll
@@ -7,19 +7,15 @@ target triple = "i686-apple-darwin8"
 define void @test(ptr%b) {
 ; CHECK-LABEL: test:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %esi, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    calll _foo
 ; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
-; CHECK-NEXT:    movsd %xmm0, (%esi)
-; CHECK-NEXT:    addl $8, %esp
-; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movsd %xmm0, (%eax)
+; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
 entry:
 	%tmp13 = tail call double @foo()
diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
index 74291fbb75e81..de6ce50b51fac 100644
--- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
@@ -75,7 +75,6 @@ define float @frem(float %x, float %y) #0 {
 ; CHECK-NEXT:    subl $20, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
 ; CHECK-NEXT:    wait
@@ -129,7 +128,6 @@ define float @pow(float %x, float %y) #0 {
 ; CHECK-NEXT:    subl $20, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
 ; CHECK-NEXT:    wait
@@ -234,7 +232,6 @@ define float @atan2(float %x, float %y) #0 {
 ; CHECK-NEXT:    subl $20, %esp
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
 ; CHECK-NEXT:    wait
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
index 6a6b86e8efa7c..8ca20bedaef13 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
@@ -47,10 +47,10 @@ define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_oeq_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
@@ -110,10 +110,10 @@ define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ogt_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmoval %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -171,10 +171,10 @@ define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_oge_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovael %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -234,10 +234,10 @@ define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_olt_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmoval %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -297,10 +297,10 @@ define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ole_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovael %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -358,10 +358,10 @@ define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_one_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -419,10 +419,10 @@ define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ord_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -480,10 +480,10 @@ define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ueq_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -543,10 +543,10 @@ define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ugt_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -606,10 +606,10 @@ define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_uge_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -667,10 +667,10 @@ define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ult_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -728,10 +728,10 @@ define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ule_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -791,10 +791,10 @@ define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_une_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
@@ -854,10 +854,10 @@ define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_uno_q:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vucomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -917,10 +917,10 @@ define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_oeq_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
@@ -980,10 +980,10 @@ define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ogt_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmoval %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1041,10 +1041,10 @@ define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_oge_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovael %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1104,10 +1104,10 @@ define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_olt_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmoval %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1167,10 +1167,10 @@ define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ole_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovael %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1228,10 +1228,10 @@ define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_one_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1289,10 +1289,10 @@ define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ord_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1350,10 +1350,10 @@ define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ueq_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1413,10 +1413,10 @@ define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ugt_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1476,10 +1476,10 @@ define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_uge_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1537,10 +1537,10 @@ define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ult_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1598,10 +1598,10 @@ define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_ule_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovbel %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
@@ -1661,10 +1661,10 @@ define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_une_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovnel %eax, %ecx
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
@@ -1724,10 +1724,10 @@ define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
 ;
 ; X86-FP16-LABEL: test_f16_uno_s:
 ; X86-FP16:       # %bb.0:
-; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
-; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-FP16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-FP16-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; X86-FP16-NEXT:    vcomish {{[0-9]+}}(%esp), %xmm0
 ; X86-FP16-NEXT:    cmovpl %eax, %ecx
 ; X86-FP16-NEXT:    movl (%ecx), %eax
 ; X86-FP16-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
index e3e2b6225a7ba..c454196271bc8 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll
@@ -11,10 +11,10 @@
 define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_oeq_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -30,10 +30,10 @@ define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_oeq_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -54,27 +54,27 @@ define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    fucompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB0_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB0_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB0_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f32_oeq_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -89,10 +89,10 @@ define i32 @test_f32_oeq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ogt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -106,10 +106,10 @@ define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ogt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -142,13 +142,13 @@ define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ogt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -162,10 +162,10 @@ define i32 @test_f32_ogt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_oge_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -179,10 +179,10 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_oge_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -215,13 +215,13 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_oge_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -235,10 +235,10 @@ define i32 @test_f32_oge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_olt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -252,10 +252,10 @@ define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_olt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -288,13 +288,13 @@ define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_olt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -308,10 +308,10 @@ define i32 @test_f32_olt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ole_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -325,10 +325,10 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ole_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -361,13 +361,13 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ole_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -381,10 +381,10 @@ define i32 @test_f32_ole_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_one_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -398,10 +398,10 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_one_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -434,13 +434,13 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_one_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -454,10 +454,10 @@ define i32 @test_f32_one_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ord_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -471,10 +471,10 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ord_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -507,13 +507,13 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ord_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -527,10 +527,10 @@ define i32 @test_f32_ord_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ueq_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -544,10 +544,10 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ueq_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -580,13 +580,13 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ueq_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -600,10 +600,10 @@ define i32 @test_f32_ueq_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ugt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -617,10 +617,10 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ugt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -653,13 +653,13 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ugt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -673,10 +673,10 @@ define i32 @test_f32_ugt_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_uge_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -690,10 +690,10 @@ define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_uge_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -726,13 +726,13 @@ define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_uge_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -746,10 +746,10 @@ define i32 @test_f32_uge_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ult_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -763,10 +763,10 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ult_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -799,13 +799,13 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ult_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -819,10 +819,10 @@ define i32 @test_f32_ult_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ule_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -836,10 +836,10 @@ define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ule_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -872,13 +872,13 @@ define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ule_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -892,10 +892,10 @@ define i32 @test_f32_ule_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_une_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -911,10 +911,10 @@ define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_une_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -935,27 +935,27 @@ define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    fucompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB12_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB12_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB12_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f32_une_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -970,10 +970,10 @@ define i32 @test_f32_une_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_uno_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -987,10 +987,10 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_uno_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vucomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1023,13 +1023,13 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_uno_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1043,10 +1043,10 @@ define i32 @test_f32_uno_q(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_oeq_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -1062,10 +1062,10 @@ define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_oeq_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -1086,27 +1086,27 @@ define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fucompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB14_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB14_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB14_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f64_oeq_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -1121,10 +1121,10 @@ define i32 @test_f64_oeq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ogt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1138,10 +1138,10 @@ define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ogt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1174,13 +1174,13 @@ define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ogt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1194,10 +1194,10 @@ define i32 @test_f64_ogt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_oge_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1211,10 +1211,10 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_oge_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1247,13 +1247,13 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_oge_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1267,10 +1267,10 @@ define i32 @test_f64_oge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_olt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1284,10 +1284,10 @@ define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_olt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1320,13 +1320,13 @@ define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_olt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1340,10 +1340,10 @@ define i32 @test_f64_olt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ole_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1357,10 +1357,10 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ole_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1393,13 +1393,13 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ole_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1413,10 +1413,10 @@ define i32 @test_f64_ole_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_one_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1430,10 +1430,10 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_one_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1466,13 +1466,13 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_one_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1486,10 +1486,10 @@ define i32 @test_f64_one_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ord_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1503,10 +1503,10 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ord_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1539,13 +1539,13 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ord_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1559,10 +1559,10 @@ define i32 @test_f64_ord_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ueq_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1576,10 +1576,10 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ueq_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1612,13 +1612,13 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ueq_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1632,10 +1632,10 @@ define i32 @test_f64_ueq_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ugt_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1649,10 +1649,10 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ugt_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1685,13 +1685,13 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ugt_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1705,10 +1705,10 @@ define i32 @test_f64_ugt_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_uge_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1722,10 +1722,10 @@ define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_uge_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1758,13 +1758,13 @@ define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_uge_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1778,10 +1778,10 @@ define i32 @test_f64_uge_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ult_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1795,10 +1795,10 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ult_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1831,13 +1831,13 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ult_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1851,10 +1851,10 @@ define i32 @test_f64_ult_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ule_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -1868,10 +1868,10 @@ define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ule_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -1904,13 +1904,13 @@ define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ule_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -1924,10 +1924,10 @@ define i32 @test_f64_ule_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_une_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -1943,10 +1943,10 @@ define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_une_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -1967,27 +1967,27 @@ define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fucompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB26_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB26_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB26_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f64_une_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -2002,10 +2002,10 @@ define i32 @test_f64_une_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_uno_q:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    ucomisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2019,10 +2019,10 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_uno_q:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vucomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2055,13 +2055,13 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_uno_q:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fucompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2075,10 +2075,10 @@ define i32 @test_f64_uno_q(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_oeq_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -2094,10 +2094,10 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_oeq_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -2118,27 +2118,27 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    fcompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB28_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB28_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB28_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f32_oeq_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -2153,10 +2153,10 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ogt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2170,10 +2170,10 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ogt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2206,13 +2206,13 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ogt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2226,10 +2226,10 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_oge_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2243,10 +2243,10 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_oge_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2279,13 +2279,13 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_oge_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2299,10 +2299,10 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_olt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2316,10 +2316,10 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_olt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2352,13 +2352,13 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_olt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2372,10 +2372,10 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ole_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2389,10 +2389,10 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ole_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2425,13 +2425,13 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ole_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2445,10 +2445,10 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_one_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2462,10 +2462,10 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_one_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2498,13 +2498,13 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_one_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2518,10 +2518,10 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ord_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2535,10 +2535,10 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ord_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2571,13 +2571,13 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ord_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2591,10 +2591,10 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ueq_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2607,11 +2607,11 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: test_f32_ueq_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2644,13 +2644,13 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ueq_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2664,10 +2664,10 @@ define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ugt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2681,10 +2681,10 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ugt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2717,13 +2717,13 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ugt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2737,10 +2737,10 @@ define i32 @test_f32_ugt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_uge_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2754,10 +2754,10 @@ define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_uge_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2790,13 +2790,13 @@ define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_uge_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2810,10 +2810,10 @@ define i32 @test_f32_uge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ult_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2827,10 +2827,10 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ult_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2863,13 +2863,13 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ult_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2883,10 +2883,10 @@ define i32 @test_f32_ult_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_ule_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -2900,10 +2900,10 @@ define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_ule_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -2936,13 +2936,13 @@ define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_ule_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -2956,10 +2956,10 @@ define i32 @test_f32_ule_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_une_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -2975,10 +2975,10 @@ define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_une_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -2999,27 +2999,27 @@ define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; X87-NEXT:    fcompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB40_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB40_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB40_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f32_une_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -3034,10 +3034,10 @@ define i32 @test_f32_une_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ; SSE-32-LABEL: test_f32_uno_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-32-NEXT:    comiss {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3051,10 +3051,10 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f32_uno_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT:    vcomiss {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3087,13 +3087,13 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f32_uno_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3107,10 +3107,10 @@ define i32 @test_f32_uno_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
 define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_oeq_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -3126,10 +3126,10 @@ define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_oeq_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -3150,27 +3150,27 @@ define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fcompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB42_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB42_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB42_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f64_oeq_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -3185,10 +3185,10 @@ define i32 @test_f64_oeq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ogt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3202,10 +3202,10 @@ define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ogt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3238,13 +3238,13 @@ define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ogt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3258,10 +3258,10 @@ define i32 @test_f64_ogt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_oge_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3275,10 +3275,10 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_oge_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3311,13 +3311,13 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_oge_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3331,10 +3331,10 @@ define i32 @test_f64_oge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_olt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmoval %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3348,10 +3348,10 @@ define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_olt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmoval %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3384,13 +3384,13 @@ define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_olt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmoval %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3404,10 +3404,10 @@ define i32 @test_f64_olt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ole_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovael %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3421,10 +3421,10 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ole_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovael %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3457,13 +3457,13 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ole_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovael %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3477,10 +3477,10 @@ define i32 @test_f64_ole_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_one_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3494,10 +3494,10 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_one_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3530,13 +3530,13 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_one_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3550,10 +3550,10 @@ define i32 @test_f64_one_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ord_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3567,10 +3567,10 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ord_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3603,13 +3603,13 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ord_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3623,10 +3623,10 @@ define i32 @test_f64_ord_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ueq_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3640,10 +3640,10 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ueq_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3676,13 +3676,13 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ueq_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3696,10 +3696,10 @@ define i32 @test_f64_ueq_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ugt_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3713,10 +3713,10 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ugt_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3749,13 +3749,13 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ugt_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3769,10 +3769,10 @@ define i32 @test_f64_ugt_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_uge_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3786,10 +3786,10 @@ define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_uge_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3822,13 +3822,13 @@ define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_uge_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3842,10 +3842,10 @@ define i32 @test_f64_uge_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ult_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3859,10 +3859,10 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ult_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3895,13 +3895,13 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ult_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3915,10 +3915,10 @@ define i32 @test_f64_ult_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_ule_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovbel %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -3932,10 +3932,10 @@ define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_ule_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovbel %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -3968,13 +3968,13 @@ define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_ule_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovbel %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
@@ -3988,10 +3988,10 @@ define i32 @test_f64_ule_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_une_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovnel %eax, %ecx
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
@@ -4007,10 +4007,10 @@ define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_une_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovnel %eax, %ecx
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
@@ -4031,27 +4031,27 @@ define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; X87-NEXT:    fcompp
 ; X87-NEXT:    wait
 ; X87-NEXT:    fnstsw %ax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X87-NEXT:    sahf
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    jne .LBB54_3
 ; X87-NEXT:  # %bb.1:
 ; X87-NEXT:    jp .LBB54_3
 ; X87-NEXT:  # %bb.2:
-; X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-NEXT:  .LBB54_3:
-; X87-NEXT:    movl (%eax), %eax
+; X87-NEXT:    movl (%ecx), %eax
 ; X87-NEXT:    retl
 ;
 ; X87-CMOV-LABEL: test_f64_une_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovnel %eax, %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
@@ -4066,10 +4066,10 @@ define i32 @test_f64_une_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ; SSE-32-LABEL: test_f64_uno_s:
 ; SSE-32:       # %bb.0:
-; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SSE-32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-32-NEXT:    comisd {{[0-9]+}}(%esp), %xmm0
 ; SSE-32-NEXT:    cmovpl %eax, %ecx
 ; SSE-32-NEXT:    movl (%ecx), %eax
 ; SSE-32-NEXT:    retl
@@ -4083,10 +4083,10 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; AVX-32-LABEL: test_f64_uno_s:
 ; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; AVX-32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-32-NEXT:    vcomisd {{[0-9]+}}(%esp), %xmm0
 ; AVX-32-NEXT:    cmovpl %eax, %ecx
 ; AVX-32-NEXT:    movl (%ecx), %eax
 ; AVX-32-NEXT:    retl
@@ -4119,13 +4119,13 @@ define i32 @test_f64_uno_s(i32 %a, i32 %b, double %f1, double %f2) #0 {
 ;
 ; X87-CMOV-LABEL: test_f64_uno_s:
 ; X87-CMOV:       # %bb.0:
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-CMOV-NEXT:    fcompi %st(1), %st
 ; X87-CMOV-NEXT:    fstp %st(0)
 ; X87-CMOV-NEXT:    wait
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X87-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X87-CMOV-NEXT:    cmovpl %eax, %ecx
 ; X87-CMOV-NEXT:    movl (%ecx), %eax
 ; X87-CMOV-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index 61a0c4eda8c72..551c9f9b98fc5 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -230,9 +230,9 @@ define void @fpext_f16_to_f32(ptr %val, ptr %ret) nounwind strictfp {
 ; X86-LABEL: fpext_f16_to_f32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vcvtsh2ss %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -273,9 +273,9 @@ define void @fpext_f16_to_f64(ptr %val, ptr %ret) nounwind strictfp {
 ; X86-LABEL: fpext_f16_to_f64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    vcvtsh2sd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -314,9 +314,9 @@ define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
 ; X86-LABEL: fptrunc_float_to_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -359,9 +359,9 @@ define void @fptrunc_double_to_f16(ptr %val, ptr%ret) nounwind strictfp {
 ; X86-LABEL: fptrunc_double_to_f16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsh %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
index ecdc507a882c3..ed1502f84e747 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
@@ -446,10 +446,10 @@ define i16 @fptoui_f32toi16(float %x) #0 {
 define i32 @fptoui_f32toi32(float %x) #0 {
 ; SSE-X86-LABEL: fptoui_f32toi32:
 ; SSE-X86:       # %bb.0:
+; SSE-X86-NEXT:    xorps %xmm1, %xmm1
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = [2.14748365E+9,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-X86-NEXT:    comiss %xmm0, %xmm2
-; SSE-X86-NEXT:    xorps %xmm1, %xmm1
 ; SSE-X86-NEXT:    ja .LBB8_2
 ; SSE-X86-NEXT:  # %bb.1:
 ; SSE-X86-NEXT:    movaps %xmm2, %xmm1
@@ -1086,10 +1086,10 @@ define i16 @fptoui_f64toi16(double %x) #0 {
 define i32 @fptoui_f64toi32(double %x) #0 {
 ; SSE-X86-LABEL: fptoui_f64toi32:
 ; SSE-X86:       # %bb.0:
+; SSE-X86-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm2 = [2.147483648E+9,0.0E+0]
 ; SSE-X86-NEXT:    comisd %xmm0, %xmm2
-; SSE-X86-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-X86-NEXT:    ja .LBB17_2
 ; SSE-X86-NEXT:  # %bb.1:
 ; SSE-X86-NEXT:    movapd %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
index f0aa3827ce937..c25071311ab51 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
@@ -588,9 +588,9 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; SSE-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-X86-NEXT:    andl $-8, %esp
 ; SSE-X86-NEXT:    subl $16, %esp
-; SSE-X86-NEXT:    movl 12(%ebp), %eax
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movl 12(%ebp), %eax
 ; SSE-X86-NEXT:    shrl $31, %eax
 ; SSE-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -630,9 +630,9 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; AVX-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $16, %esp
-; AVX-X86-NEXT:    movl 12(%ebp), %eax
 ; AVX-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-X86-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX-X86-NEXT:    movl 12(%ebp), %eax
 ; AVX-X86-NEXT:    shrl $31, %eax
 ; AVX-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -1277,9 +1277,9 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; SSE-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-X86-NEXT:    andl $-8, %esp
 ; SSE-X86-NEXT:    subl $24, %esp
-; SSE-X86-NEXT:    movl 12(%ebp), %eax
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movl 12(%ebp), %eax
 ; SSE-X86-NEXT:    shrl $31, %eax
 ; SSE-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -1319,9 +1319,9 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; AVX-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $24, %esp
-; AVX-X86-NEXT:    movl 12(%ebp), %eax
 ; AVX-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-X86-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX-X86-NEXT:    movl 12(%ebp), %eax
 ; AVX-X86-NEXT:    shrl $31, %eax
 ; AVX-X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX-X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll
index f1be74f5c3ac4..8707402940e8d 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll
@@ -418,9 +418,9 @@ define void @fpext_f32_to_f64(ptr %val, ptr %ret) nounwind strictfp {
 ; SSE-X86-LABEL: fpext_f32_to_f64:
 ; SSE-X86:       # %bb.0:
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    cvtss2sd %xmm0, %xmm0
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    movsd %xmm0, (%eax)
 ; SSE-X86-NEXT:    retl
 ;
@@ -434,9 +434,9 @@ define void @fpext_f32_to_f64(ptr %val, ptr %ret) nounwind strictfp {
 ; AVX-X86-LABEL: fpext_f32_to_f64:
 ; AVX-X86:       # %bb.0:
 ; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%eax)
 ; AVX-X86-NEXT:    retl
 ;
@@ -450,8 +450,9 @@ define void @fpext_f32_to_f64(ptr %val, ptr %ret) nounwind strictfp {
 ; X87-LABEL: fpext_f32_to_f64:
 ; X87:       # %bb.0:
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X87-NEXT:    flds (%ecx)
+; X87-NEXT:    flds (%eax)
+; X87-NEXT:    wait
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    fstpl (%eax)
 ; X87-NEXT:    wait
 ; X87-NEXT:    retl
@@ -466,9 +467,9 @@ define void @fptrunc_double_to_f32(ptr %val, ptr%ret) nounwind strictfp {
 ; SSE-X86-LABEL: fptrunc_double_to_f32:
 ; SSE-X86:       # %bb.0:
 ; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    cvtsd2ss %xmm0, %xmm0
+; SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-X86-NEXT:    movss %xmm0, (%eax)
 ; SSE-X86-NEXT:    retl
 ;
@@ -482,9 +483,9 @@ define void @fptrunc_double_to_f32(ptr %val, ptr%ret) nounwind strictfp {
 ; AVX-X86-LABEL: fptrunc_double_to_f32:
 ; AVX-X86:       # %bb.0:
 ; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-X86-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-X86-NEXT:    vmovss %xmm0, (%eax)
 ; AVX-X86-NEXT:    retl
 ;
@@ -499,9 +500,10 @@ define void @fptrunc_double_to_f32(ptr %val, ptr%ret) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    pushl %eax
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X87-NEXT:    fldl (%ecx)
+; X87-NEXT:    fldl (%eax)
 ; X87-NEXT:    fstps (%esp)
+; X87-NEXT:    wait
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    flds (%esp)
 ; X87-NEXT:    fstps (%eax)
 ; X87-NEXT:    wait
@@ -614,10 +616,10 @@ define double @fma_f64(double %a, double %b, double %c) nounwind strictfp {
 ; SSE-X86:       # %bb.0:
 ; SSE-X86-NEXT:    subl $24, %esp
 ; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-X86-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-X86-NEXT:    movsd %xmm2, {{[0-9]+}}(%esp)
-; SSE-X86-NEXT:    movsd %xmm1, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-X86-NEXT:    movsd %xmm0, (%esp)
 ; SSE-X86-NEXT:    calll fma
 ; SSE-X86-NEXT:    addl $24, %esp
@@ -655,10 +657,10 @@ define double @fma_f64(double %a, double %b, double %c) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    subl $24, %esp
 ; X87-NEXT:    fldl {{[0-9]+}}(%esp)
-; X87-NEXT:    fldl {{[0-9]+}}(%esp)
-; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstpl (%esp)
 ; X87-NEXT:    wait
 ; X87-NEXT:    calll fma
@@ -675,10 +677,10 @@ define float @fma_f32(float %a, float %b, float %c) nounwind strictfp {
 ; SSE-X86:       # %bb.0:
 ; SSE-X86-NEXT:    subl $12, %esp
 ; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-X86-NEXT:    movss %xmm2, {{[0-9]+}}(%esp)
-; SSE-X86-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; SSE-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT:    movss %xmm0, (%esp)
 ; SSE-X86-NEXT:    calll fmaf
 ; SSE-X86-NEXT:    addl $12, %esp
@@ -712,10 +714,10 @@ define float @fma_f32(float %a, float %b, float %c) nounwind strictfp {
 ; X87:       # %bb.0:
 ; X87-NEXT:    subl $12, %esp
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
-; X87-NEXT:    flds {{[0-9]+}}(%esp)
-; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
+; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps {{[0-9]+}}(%esp)
+; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps (%esp)
 ; X87-NEXT:    wait
 ; X87-NEXT:    calll fmaf
diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index bb5640aeb66fa..d0c71c766b283 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -36,8 +36,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPExtF16_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    movzwl vf16, %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -48,15 +47,14 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
 ; X86-NEXT:    calll __extendsftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 entry:
   %0 = load half, ptr @vf16, align 2
@@ -86,8 +84,7 @@ define dso_local void @TestFPExtF32_F128() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPExtF32_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    flds vf32
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    wait
@@ -96,15 +93,14 @@ define dso_local void @TestFPExtF32_F128() nounwind strictfp {
 ; X86-NEXT:    calll __extendsftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 entry:
   %0 = load float, ptr @vf32, align 4
@@ -134,8 +130,7 @@ define dso_local void @TestFPExtF64_F128() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPExtF64_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    fldl vf64
 ; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X86-NEXT:    wait
@@ -144,15 +139,14 @@ define dso_local void @TestFPExtF64_F128() nounwind strictfp {
 ; X86-NEXT:    calll __extenddftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 entry:
   %0 = load double, ptr @vf64, align 8
@@ -186,8 +180,7 @@ define dso_local void @TestFPExtF80_F128() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPExtF80_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    fldt vf80
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
 ; X86-NEXT:    wait
@@ -196,15 +189,14 @@ define dso_local void @TestFPExtF80_F128() nounwind strictfp {
 ; X86-NEXT:    calll __extendxftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 entry:
   %0 = load x86_fp80, ptr @vf80, align 8
@@ -235,15 +227,21 @@ define dso_local void @TestFPTruncF128_F16() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPTruncF128_F16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfhf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movw %ax, vf16
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %0 = load fp128, ptr @vf128, align 16
@@ -273,16 +271,22 @@ define dso_local void @TestFPTruncF128_F32() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPTruncF128_F32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfsf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstps vf32
 ; X86-NEXT:    wait
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %0 = load fp128, ptr @vf128, align 16
@@ -312,16 +316,22 @@ define dso_local void @TestFPTruncF128_F64() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPTruncF128_F64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfdf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpl vf64
 ; X86-NEXT:    wait
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %0 = load fp128, ptr @vf128, align 16
@@ -353,16 +363,22 @@ define dso_local void @TestFPTruncF128_F80() nounwind strictfp {
 ;
 ; X86-LABEL: TestFPTruncF128_F80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfxf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpt vf80
 ; X86-NEXT:    wait
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %0 = load fp128, ptr @vf128, align 16
@@ -382,15 +398,21 @@ define dso_local i8 @fptosi_i8(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptosi_i8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixtfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i8 @llvm.experimental.constrained.fptosi.i8.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -408,15 +430,21 @@ define i16 @fptosi_i16(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptosi_i16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixtfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i16 @llvm.experimental.constrained.fptosi.i16.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -433,13 +461,19 @@ define dso_local i32 @fptosi_i32(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptosi_i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixtfsi
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -456,13 +490,19 @@ define i64 @fptosi_i64(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptosi_i64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixtfdi
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -484,11 +524,11 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -524,15 +564,21 @@ define dso_local i8 @fptoui_i8(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptoui_i8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixunstfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i8 @llvm.experimental.constrained.fptoui.i8.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -550,15 +596,21 @@ define i16 @fptoui_i16(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptoui_i16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixunstfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i16 @llvm.experimental.constrained.fptoui.i16.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -575,13 +627,19 @@ define dso_local i32 @fptoui_i32(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptoui_i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixunstfsi
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -598,13 +656,19 @@ define i64 @fptoui_i64(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: fptoui_i64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __fixunstfdi
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -626,11 +690,11 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -670,8 +734,8 @@ define fp128 @sitofp_i8(i8 %x) nounwind strictfp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
@@ -710,8 +774,8 @@ define fp128 @sitofp_i16(i16 %x) nounwind strictfp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
@@ -750,9 +814,10 @@ define fp128 @sitofp_i32(i32 %x) nounwind strictfp {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatsitf
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    movl (%esp), %eax
@@ -788,10 +853,12 @@ define fp128 @sitofp_i64(i64 %x) nounwind strictfp {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __floatditf
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    movl (%esp), %eax
@@ -827,11 +894,11 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -871,8 +938,8 @@ define fp128 @uitofp_i8(i8 %x) nounwind strictfp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
@@ -911,8 +978,8 @@ define fp128 @uitofp_i16(i16 %x) nounwind strictfp {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
@@ -951,9 +1018,10 @@ define fp128 @uitofp_i32(i32 %x) nounwind strictfp {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatunsitf
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    movl (%esp), %eax
@@ -989,10 +1057,12 @@ define fp128 @uitofp_i64(i64 %x) nounwind strictfp {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __floatunditf
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    movl (%esp), %eax
@@ -1028,11 +1098,11 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 6d4ec063ccd46..a05ec267e9a6e 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -33,8 +33,7 @@ define dso_local void @TestFPExtF32_F128() nounwind {
 ;
 ; X86-LABEL: TestFPExtF32_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    flds vf32
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -42,15 +41,14 @@ define dso_local void @TestFPExtF32_F128() nounwind {
 ; X86-NEXT:    calll __extendsftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPExtF32_F128:
@@ -80,8 +78,7 @@ define dso_local void @TestFPExtF64_F128() nounwind {
 ;
 ; X86-LABEL: TestFPExtF64_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    fldl vf64
 ; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -89,15 +86,14 @@ define dso_local void @TestFPExtF64_F128() nounwind {
 ; X86-NEXT:    calll __extenddftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPExtF64_F128:
@@ -128,8 +124,7 @@ define dso_local void @TestFPExtF80_F128() nounwind {
 ;
 ; X86-LABEL: TestFPExtF80_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    fldt vf80
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -137,15 +132,14 @@ define dso_local void @TestFPExtF80_F128() nounwind {
 ; X86-NEXT:    calll __extendxftf2
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPExtF80_F128:
@@ -176,15 +170,21 @@ define dso_local void @TestFPToSIF128_I16() nounwind {
 ;
 ; X86-LABEL: TestFPToSIF128_I16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixtfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movw %ax, vi16
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToSIF128_I16:
@@ -214,15 +214,21 @@ define dso_local void @TestFPToUIF128_I16() nounwind {
 ;
 ; X86-LABEL: TestFPToUIF128_I16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixunstfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movw %ax, vi16
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToUIF128_I16:
@@ -252,15 +258,21 @@ define dso_local void @TestFPToSIF128_I32() nounwind {
 ;
 ; X86-LABEL: TestFPToSIF128_I32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixtfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, vi32
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToSIF128_I32:
@@ -290,15 +302,21 @@ define dso_local void @TestFPToUIF128_U32() nounwind {
 ;
 ; X86-LABEL: TestFPToUIF128_U32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixunstfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, vu32
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToUIF128_U32:
@@ -329,17 +347,23 @@ define dso_local void @TestFPToSIF128_I64() nounwind {
 ;
 ; X86-LABEL: TestFPToSIF128_I64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixtfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, vi64
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, vi64+4
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToSIF128_I64:
@@ -372,16 +396,22 @@ define dso_local void @TestFPToUIF128_U64() nounwind {
 ;
 ; X86-LABEL: TestFPToUIF128_U64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __fixunstfsi
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, vu64
 ; X86-NEXT:    movl $0, vu64+4
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToUIF128_U64:
@@ -414,30 +444,28 @@ define dso_local void @TestFPToSIF128_I128() nounwind {
 ;
 ; X86-LABEL: TestFPToSIF128_I128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl vf128, %eax
-; X86-NEXT:    movl vf128+4, %ecx
-; X86-NEXT:    movl vf128+8, %edx
-; X86-NEXT:    movl vf128+12, %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixtfti
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vi128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vi128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vi128+12
-; X86-NEXT:    movl %edx, vi128+8
 ; X86-NEXT:    movl %ecx, vi128+4
 ; X86-NEXT:    movl %eax, vi128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToSIF128_I128:
@@ -469,30 +497,28 @@ define dso_local void @TestFPToUIF128_U128() nounwind {
 ;
 ; X86-LABEL: TestFPToUIF128_U128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl vf128, %eax
-; X86-NEXT:    movl vf128+4, %ecx
-; X86-NEXT:    movl vf128+8, %edx
-; X86-NEXT:    movl vf128+12, %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunstfti
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vu128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vu128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vu128+12
-; X86-NEXT:    movl %edx, vu128+8
 ; X86-NEXT:    movl %ecx, vu128+4
 ; X86-NEXT:    movl %eax, vu128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPToUIF128_U128:
@@ -523,15 +549,21 @@ define dso_local void @TestFPTruncF128_F32() nounwind {
 ;
 ; X86-LABEL: TestFPTruncF128_F32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfsf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstps vf32
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPTruncF128_F32:
@@ -561,15 +593,21 @@ define dso_local void @TestFPTruncF128_F64() nounwind {
 ;
 ; X86-LABEL: TestFPTruncF128_F64:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfdf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpl vf64
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPTruncF128_F64:
@@ -599,15 +637,21 @@ define dso_local void @TestFPTruncF128_F80() nounwind {
 ;
 ; X86-LABEL: TestFPTruncF128_F80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl vf128+8, %ecx
+; X86-NEXT:    movl vf128+4, %edx
+; X86-NEXT:    movl vf128, %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __trunctfxf2
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    fstpt vf80
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestFPTruncF128_F80:
@@ -637,25 +681,22 @@ define dso_local void @TestSIToFPI16_F128() nounwind {
 ;
 ; X86-LABEL: TestSIToFPI16_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $36, %esp
 ; X86-NEXT:    movswl vi16, %eax
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatsitf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestSIToFPI16_F128:
@@ -685,25 +726,22 @@ define dso_local void @TestSIToFPU16_F128() nounwind {
 ;
 ; X86-LABEL: TestSIToFPU16_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $36, %esp
 ; X86-NEXT:    movzwl vi16, %eax
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatunsitf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestSIToFPU16_F128:
@@ -733,23 +771,22 @@ define dso_local void @TestSIToFPI32_F128() nounwind {
 ;
 ; X86-LABEL: TestSIToFPI32_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vi32
+; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    movl vi32, %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatsitf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestSIToFPI32_F128:
@@ -779,23 +816,22 @@ define dso_local void @TestUIToFPU32_F128() #2 {
 ;
 ; X86-LABEL: TestUIToFPU32_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vu32
+; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    movl vu32, %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __floatunsitf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestUIToFPU32_F128:
@@ -825,24 +861,24 @@ define dso_local void @TestSIToFPI64_F128() nounwind {
 ;
 ; X86-LABEL: TestSIToFPI64_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vi64+4
-; X86-NEXT:    pushl vi64
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movl vi64+4, %eax
+; X86-NEXT:    movl vi64, %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __floatditf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestSIToFPI64_F128:
@@ -872,24 +908,24 @@ define dso_local void @TestUIToFPU64_F128() #2 {
 ;
 ; X86-LABEL: TestUIToFPU64_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vu64+4
-; X86-NEXT:    pushl vu64
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movl vu64+4, %eax
+; X86-NEXT:    movl vu64, %ecx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __floatunditf
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestUIToFPU64_F128:
@@ -920,30 +956,28 @@ define dso_local void @TestSIToFPI128_F128() nounwind {
 ;
 ; X86-LABEL: TestSIToFPI128_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl vi128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vi128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vi128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl vi128, %eax
-; X86-NEXT:    movl vi128+4, %ecx
-; X86-NEXT:    movl vi128+8, %edx
-; X86-NEXT:    movl vi128+12, %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floattitf
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestSIToFPI128_F128:
@@ -975,30 +1009,28 @@ define dso_local void @TestUIToFPU128_F128() #2 {
 ;
 ; X86-LABEL: TestUIToFPU128_F128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl vu128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vu128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vu128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl vu128, %eax
-; X86-NEXT:    movl vu128+4, %ecx
-; X86-NEXT:    movl vu128+8, %edx
-; X86-NEXT:    movl vu128+12, %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floatuntitf
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, vf128+8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, vf128+12
-; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestUIToFPU128_F128:
@@ -1032,22 +1064,28 @@ define dso_local i32 @TestConst128(fp128 %v) nounwind {
 ;
 ; X86-LABEL: TestConst128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pushl $1073676288 # imm = 0x3FFF0000
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __gttf2
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestConst128:
@@ -1083,22 +1121,28 @@ define dso_local i32 @TestConst128Zero(fp128 %v) nounwind {
 ;
 ; X86-LABEL: TestConst128Zero:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __gttf2
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestConst128Zero:
@@ -1153,17 +1197,17 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
@@ -1234,14 +1278,14 @@ define fp128 @TestPair128(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl $3, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl $3, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -1283,7 +1327,7 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1292,31 +1336,29 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X86-NEXT:    cmpl $50001, {{[0-9]+}}(%esp) # imm = 0xC351
 ; X86-NEXT:    jl .LBB26_4
 ; X86-NEXT:  # %bb.1: # %if.then
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __trunctfdf2
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-NEXT:    testb $-128, {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    testb $-128, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB26_3
 ; X86-NEXT:  # %bb.2: # %if.then
 ; X86-NEXT:    fstp %st(1)
 ; X86-NEXT:    fldz
 ; X86-NEXT:  .LBB26_3: # %if.then
 ; X86-NEXT:    fstp %st(0)
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __extenddftf2
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:  .LBB26_4: # %cleanup
 ; X86-NEXT:    movl %edx, (%esi)
@@ -1324,7 +1366,7 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%esi)
 ; X86-NEXT:    movl %eax, 12(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $36, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index ad2d690fd7ed0..5c177ce79d937 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -41,28 +41,25 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
@@ -70,11 +67,8 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: add:
@@ -94,27 +88,26 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -128,10 +121,9 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -163,28 +155,25 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
@@ -192,11 +181,8 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sub:
@@ -216,27 +202,26 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -250,10 +235,9 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -285,28 +269,25 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
@@ -314,11 +295,8 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: mul:
@@ -338,27 +316,26 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -372,10 +349,9 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -407,28 +383,25 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
@@ -436,11 +409,8 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: div:
@@ -460,27 +430,26 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -494,10 +463,9 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -522,48 +490,42 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ;
 ; X86-LABEL: fma:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $92, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl $88, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%ebp)
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    addl $92, %esp
+; X86-NEXT:    movaps %xmm0, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $88, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: fma:
@@ -586,35 +548,34 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $96, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 52(%ebp), %ebx
-; WIN-X86-NEXT:    movl 56(%ebp), %edi
-; WIN-X86-NEXT:    movl 60(%ebp), %edx
-; WIN-X86-NEXT:    movl 64(%ebp), %ecx
-; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl 64(%ebp), %eax
+; WIN-X86-NEXT:    movl 68(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 56(%ebp), %eax
+; WIN-X86-NEXT:    movl 60(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 48(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 40(%ebp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 32(%ebp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -628,10 +589,9 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -656,28 +616,25 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: frem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
@@ -685,11 +642,8 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: frem:
@@ -709,27 +663,26 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -743,10 +696,9 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -771,16 +723,15 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: ceil:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -790,9 +741,8 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: ceil:
@@ -814,12 +764,12 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -861,16 +811,15 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: acos:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -880,9 +829,8 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: acos:
@@ -904,12 +852,12 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -951,16 +899,15 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cos:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -970,9 +917,8 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cos:
@@ -994,12 +940,12 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1041,16 +987,15 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cosh:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1060,9 +1005,8 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cosh:
@@ -1084,12 +1028,12 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1131,16 +1075,15 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1150,9 +1093,8 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp:
@@ -1174,12 +1116,12 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1221,16 +1163,15 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1240,9 +1181,8 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp2:
@@ -1264,12 +1204,12 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1311,16 +1251,15 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: floor:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1330,9 +1269,8 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: floor:
@@ -1354,12 +1292,12 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1401,16 +1339,15 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1420,9 +1357,8 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log:
@@ -1444,12 +1380,12 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1491,16 +1427,15 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log10:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1510,9 +1445,8 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log10:
@@ -1534,12 +1468,12 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1581,16 +1515,15 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1600,9 +1533,8 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log2:
@@ -1624,12 +1556,12 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1671,28 +1603,25 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: maxnum:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaxf128
@@ -1700,11 +1629,8 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: maxnum:
@@ -1724,27 +1650,26 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -1758,10 +1683,9 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1786,27 +1710,24 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: minnum:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
@@ -1815,11 +1736,8 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: minnum:
@@ -1839,27 +1757,26 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -1873,10 +1790,9 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1901,16 +1817,15 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: nearbyint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1920,9 +1835,8 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: nearbyint:
@@ -1944,12 +1858,12 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1991,27 +1905,24 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: pow:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
@@ -2020,11 +1931,8 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: pow:
@@ -2044,27 +1952,26 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -2078,10 +1985,9 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2113,19 +2019,17 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ;
 ; X86-LABEL: powi:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2135,10 +2039,8 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $64, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: powi:
@@ -2155,20 +2057,19 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2183,10 +2084,9 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2211,16 +2111,15 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: rint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2230,9 +2129,8 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: rint:
@@ -2254,12 +2152,12 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2301,16 +2199,15 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: round:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2320,9 +2217,8 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: round:
@@ -2344,12 +2240,12 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2391,16 +2287,15 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: roundeven:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2410,9 +2305,8 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: roundeven:
@@ -2434,12 +2328,12 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2481,16 +2375,15 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: asin:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2500,9 +2393,8 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: asin:
@@ -2524,12 +2416,12 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2571,16 +2463,15 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sin:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2590,9 +2481,8 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sin:
@@ -2614,12 +2504,12 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2661,16 +2551,15 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sinh:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2680,9 +2569,8 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sinh:
@@ -2704,12 +2592,12 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2751,16 +2639,15 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sqrt:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2770,9 +2657,8 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sqrt:
@@ -2794,12 +2680,12 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2841,16 +2727,15 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: atan:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2860,9 +2745,8 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan:
@@ -2884,12 +2768,12 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -2929,30 +2813,27 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; GNU-NEXT:    popq %rax
 ; GNU-NEXT:    retq
 ;
-; X86-LABEL: atan2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
+; X86-LABEL: atan2:
+; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
@@ -2960,11 +2841,8 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan2:
@@ -2984,27 +2862,26 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    subl $64, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
@@ -3018,10 +2895,9 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    leal -8(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -3046,16 +2922,15 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tan:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3065,9 +2940,8 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tan:
@@ -3089,12 +2963,12 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3136,16 +3010,15 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tanh:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3155,9 +3028,8 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tanh:
@@ -3179,12 +3051,12 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3226,16 +3098,15 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: trunc:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $56, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3245,9 +3116,8 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: trunc:
@@ -3269,12 +3139,12 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -3316,13 +3186,19 @@ define i32 @lrint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: lrint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lrintf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: lrint:
@@ -3339,15 +3215,21 @@ define i32 @lrint(fp128 %x) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl 8(%ebp), %edx
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
 ; WIN-X86-NEXT:    calll _lrintl
 ; WIN-X86-NEXT:    addl $16, %esp
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -3372,13 +3254,19 @@ define i64 @llrint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: llrint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: llrint:
@@ -3395,15 +3283,21 @@ define i64 @llrint(fp128 %x) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl 8(%ebp), %edx
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
 ; WIN-X86-NEXT:    calll _llrintl
 ; WIN-X86-NEXT:    addl $16, %esp
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -3428,13 +3322,19 @@ define i32 @lround(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: lround:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lroundf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: lround:
@@ -3451,15 +3351,21 @@ define i32 @lround(fp128 %x) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl 8(%ebp), %edx
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
 ; WIN-X86-NEXT:    calll _lroundl
 ; WIN-X86-NEXT:    addl $16, %esp
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -3484,13 +3390,19 @@ define i64 @llround(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: llround:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llroundf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: llround:
@@ -3507,15 +3419,21 @@ define i64 @llround(fp128 %x) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl 8(%ebp), %edx
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
 ; WIN-X86-NEXT:    calll _llroundl
 ; WIN-X86-NEXT:    addl $16, %esp
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -3574,24 +3492,39 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; X86-LABEL: cmp:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edx
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: cmp:
@@ -3620,15 +3553,24 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl 40(%ebp), %edx
+; WIN-X86-NEXT:    movl 44(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %ebx
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    pushl %ecx
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %eax
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -3642,7 +3584,10 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:  LBB37_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmp.f128(
@@ -3704,24 +3649,39 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; X86-LABEL: cmps:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edx
 ; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: cmps:
@@ -3750,15 +3710,24 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl 40(%ebp), %edx
+; WIN-X86-NEXT:    movl 44(%ebp), %esi
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %ebx
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    pushl %ecx
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl %edx
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
+; WIN-X86-NEXT:    pushl %eax
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -3772,7 +3741,10 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:  LBB38_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
-; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
+; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmps.f128(
@@ -3872,16 +3844,16 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
@@ -3892,8 +3864,8 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
 ; X86-NEXT:    addl $32, %esp
@@ -3956,15 +3928,15 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    movl 32(%ebp), %edi
-; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    movl 44(%ebp), %esi
+; WIN-X86-NEXT:    movl 28(%ebp), %edi
 ; WIN-X86-NEXT:    pushl 52(%ebp)
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl 28(%ebp)
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -3972,11 +3944,11 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    sete %bl
 ; WIN-X86-NEXT:    pushl 52(%ebp)
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl 28(%ebp)
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -4093,16 +4065,16 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
 ; X86-NEXT:    addl $32, %esp
@@ -4113,19 +4085,19 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    sete %al
-; X86-NEXT:    testb %bl, %al
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovnel %eax, %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    testb %bl, %al
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edx
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -4179,15 +4151,15 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $16, %esp
-; WIN-X86-NEXT:    movl 32(%ebp), %edi
-; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    movl 44(%ebp), %esi
+; WIN-X86-NEXT:    movl 28(%ebp), %edi
 ; WIN-X86-NEXT:    pushl 52(%ebp)
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl 28(%ebp)
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
@@ -4195,11 +4167,11 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    setne %bl
 ; WIN-X86-NEXT:    pushl 52(%ebp)
 ; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl 28(%ebp)
 ; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index 4b0449fd7502e..31e9bd7d84c7c 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -42,27 +42,23 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
@@ -70,10 +66,6 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Add:
@@ -94,42 +86,36 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl 24(%ebp), %edi
-; WIN-X86-NEXT:    movl 28(%ebp), %ebx
-; WIN-X86-NEXT:    movl 32(%ebp), %ecx
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -171,27 +157,23 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl vf128, %edi
-; X86-NEXT:    movl vf128+4, %ebx
-; X86-NEXT:    movl vf128+8, %ebp
-; X86-NEXT:    movl vf128+12, %eax
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
@@ -199,10 +181,6 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Add:
@@ -223,42 +201,36 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %esi
-; WIN-X86-NEXT:    movl 20(%ebp), %edi
-; WIN-X86-NEXT:    movl _vf128, %edx
-; WIN-X86-NEXT:    movl _vf128+4, %ebx
-; WIN-X86-NEXT:    movl _vf128+8, %ecx
-; WIN-X86-NEXT:    movl _vf128+12, %eax
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+8, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+4, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+8
-; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -295,27 +267,23 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
@@ -323,10 +291,6 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sub:
@@ -347,42 +311,36 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl 24(%ebp), %edi
-; WIN-X86-NEXT:    movl 28(%ebp), %ebx
-; WIN-X86-NEXT:    movl 32(%ebp), %ecx
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -424,27 +382,23 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl vf128, %edi
-; X86-NEXT:    movl vf128+4, %ebx
-; X86-NEXT:    movl vf128+8, %ebp
-; X86-NEXT:    movl vf128+12, %eax
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
@@ -452,10 +406,6 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Sub:
@@ -476,42 +426,36 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %esi
-; WIN-X86-NEXT:    movl 20(%ebp), %edi
-; WIN-X86-NEXT:    movl _vf128, %edx
-; WIN-X86-NEXT:    movl _vf128+4, %ebx
-; WIN-X86-NEXT:    movl _vf128+8, %ecx
-; WIN-X86-NEXT:    movl _vf128+12, %eax
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+8, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+4, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+8
-; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -548,26 +492,22 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
@@ -576,10 +516,6 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Mul:
@@ -600,42 +536,36 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl 24(%ebp), %edi
-; WIN-X86-NEXT:    movl 28(%ebp), %ebx
-; WIN-X86-NEXT:    movl 32(%ebp), %ecx
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -677,27 +607,23 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl vf128, %edi
-; X86-NEXT:    movl vf128+4, %ebx
-; X86-NEXT:    movl vf128+8, %ebp
-; X86-NEXT:    movl vf128+12, %eax
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
@@ -705,10 +631,6 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Mul:
@@ -729,42 +651,36 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %esi
-; WIN-X86-NEXT:    movl 20(%ebp), %edi
-; WIN-X86-NEXT:    movl _vf128, %edx
-; WIN-X86-NEXT:    movl _vf128+4, %ebx
-; WIN-X86-NEXT:    movl _vf128+8, %ecx
-; WIN-X86-NEXT:    movl _vf128+12, %eax
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+8, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+4, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+8
-; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -801,26 +717,22 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
@@ -829,10 +741,6 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Div:
@@ -853,42 +761,36 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl 24(%ebp), %edi
-; WIN-X86-NEXT:    movl 28(%ebp), %ebx
-; WIN-X86-NEXT:    movl 32(%ebp), %ecx
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -930,27 +832,23 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl vf128, %edi
-; X86-NEXT:    movl vf128+4, %ebx
-; X86-NEXT:    movl vf128+8, %ebp
-; X86-NEXT:    movl vf128+12, %eax
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
@@ -958,10 +856,6 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Div:
@@ -982,42 +876,36 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %esi
-; WIN-X86-NEXT:    movl 20(%ebp), %edi
-; WIN-X86-NEXT:    movl _vf128, %edx
-; WIN-X86-NEXT:    movl _vf128+4, %ebx
-; WIN-X86-NEXT:    movl _vf128+8, %ecx
-; WIN-X86-NEXT:    movl _vf128+12, %eax
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+8, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+4, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+8
-; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1046,26 +934,22 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
@@ -1074,10 +958,6 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rem:
@@ -1098,42 +978,36 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl 24(%ebp), %edi
-; WIN-X86-NEXT:    movl 28(%ebp), %ebx
-; WIN-X86-NEXT:    movl 32(%ebp), %ecx
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1165,27 +1039,23 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl vf128, %edi
-; X86-NEXT:    movl vf128+4, %ebx
-; X86-NEXT:    movl vf128+8, %ebp
-; X86-NEXT:    movl vf128+12, %eax
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+8, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128+4, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
@@ -1193,10 +1063,6 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Rem:
@@ -1217,42 +1083,36 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 16(%ebp), %esi
-; WIN-X86-NEXT:    movl 20(%ebp), %edi
-; WIN-X86-NEXT:    movl _vf128, %edx
-; WIN-X86-NEXT:    movl _vf128+4, %ebx
-; WIN-X86-NEXT:    movl _vf128+8, %ecx
-; WIN-X86-NEXT:    movl _vf128+12, %eax
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 12(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+8, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128+4, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl _vf128, %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+8
-; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1281,14 +1141,13 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sqrt:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1297,8 +1156,7 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sqrt:
@@ -1316,30 +1174,28 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sqrtl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1368,14 +1224,13 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sin:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1384,8 +1239,7 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sin:
@@ -1403,30 +1257,28 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1455,14 +1307,13 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Cos:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1471,8 +1322,7 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Cos:
@@ -1490,30 +1340,28 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _cosl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1542,14 +1390,13 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Ceil:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1558,8 +1405,7 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Ceil:
@@ -1577,30 +1423,28 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _ceill
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1629,14 +1473,13 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Floor:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1645,8 +1488,7 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Floor:
@@ -1664,30 +1506,28 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _floorl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1716,14 +1556,13 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Trunc:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1732,8 +1571,7 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Trunc:
@@ -1751,30 +1589,28 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _truncl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1803,14 +1639,13 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Nearbyint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1819,8 +1654,7 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Nearbyint:
@@ -1838,30 +1672,28 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _nearbyintl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1890,14 +1722,13 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Rint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1906,8 +1737,7 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rint:
@@ -1925,30 +1755,28 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _rintl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1977,14 +1805,13 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Round:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1993,8 +1820,7 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Round:
@@ -2012,30 +1838,28 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %eax
+; WIN-X86-NEXT:    movl 20(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 8(%ebp), %eax
 ; WIN-X86-NEXT:    movl 12(%ebp), %ecx
-; WIN-X86-NEXT:    movl 16(%ebp), %edx
-; WIN-X86-NEXT:    movl 20(%ebp), %esi
-; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundl
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+12
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, _vf128+8
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    movl %esi, _vf128+12
-; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2056,48 +1880,39 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ;
 ; X86-LABEL: Test128FMA:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $92, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%ebp)
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    addl $92, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128FMA:
@@ -2120,52 +1935,45 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $96, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 52(%ebp), %ebx
-; WIN-X86-NEXT:    movl 56(%ebp), %edi
-; WIN-X86-NEXT:    movl 60(%ebp), %edx
-; WIN-X86-NEXT:    movl 64(%ebp), %ecx
-; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl 64(%ebp), %eax
+; WIN-X86-NEXT:    movl 68(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 56(%ebp), %eax
+; WIN-X86-NEXT:    movl 60(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 48(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 40(%ebp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 32(%ebp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmal
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2185,28 +1993,23 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Acos:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll acosf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Acos:
@@ -2223,34 +2026,29 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _acosl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.acos.f128(fp128 %a)
@@ -2268,28 +2066,23 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Asin:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll asinf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Asin:
@@ -2306,34 +2099,29 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _asinl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.asin.f128(fp128 %a)
@@ -2351,28 +2139,23 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Atan:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atanf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan:
@@ -2389,34 +2172,29 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atanl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.atan.f128(fp128 %a)
@@ -2434,40 +2212,31 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ;
 ; X86-LABEL: Test128Atan2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    addl $76, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan2:
@@ -2487,44 +2256,37 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
 ; WIN-X86-NEXT:    subl $80, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl 40(%ebp), %ebx
-; WIN-X86-NEXT:    movl 44(%ebp), %edx
-; WIN-X86-NEXT:    movl 48(%ebp), %ecx
-; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl 44(%ebp), %ecx
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 32(%ebp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atan2l
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
@@ -2542,28 +2304,23 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Cosh:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll coshf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Cosh:
@@ -2580,34 +2337,29 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _coshl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.cosh.f128(fp128 %a)
@@ -2625,28 +2377,23 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Sinh:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinhf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Sinh:
@@ -2663,34 +2410,29 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinhl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.sinh.f128(fp128 %a)
@@ -2708,28 +2450,23 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tan:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tan:
@@ -2746,34 +2483,29 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.tan.f128(fp128 %a)
@@ -2791,28 +2523,23 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tanh:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanhf128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $52, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tanh:
@@ -2829,34 +2556,29 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $48, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanhl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl %edx, 8(%esi)
-; WIN-X86-NEXT:    movl %ecx, 4(%esi)
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.tanh.f128(fp128 %a)
@@ -2884,34 +2606,27 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Modf:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll modff128
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, 16(%eax)
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
-; X86-NEXT:    movaps %xmm1, 16(%esi)
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $80, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Modf:
@@ -2930,52 +2645,39 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
-; WIN-X86-NEXT:    pushl %ebx
-; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $112, %esp
-; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    subl $96, %esp
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %ecx
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl 24(%ebp), %eax
 ; WIN-X86-NEXT:    movl 28(%ebp), %ecx
-; WIN-X86-NEXT:    movl 32(%ebp), %edx
-; WIN-X86-NEXT:    movl 36(%ebp), %edi
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
-; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _modfl
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN-X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 28(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 24(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN-X86-NEXT:    movl %edx, 20(%eax)
+; WIN-X86-NEXT:    movl %ecx, 16(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 12(%eax)
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    movl %ecx, 8(%eax)
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN-X86-NEXT:    movl %edx, 28(%esi)
-; WIN-X86-NEXT:    movl %eax, 24(%esi)
-; WIN-X86-NEXT:    movl %ecx, 20(%esi)
-; WIN-X86-NEXT:    movl %ebx, 16(%esi)
-; WIN-X86-NEXT:    movl %edi, 12(%esi)
-; WIN-X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN-X86-NEXT:    movl %eax, 8(%esi)
-; WIN-X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN-X86-NEXT:    movl %eax, 4(%esi)
-; WIN-X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN-X86-NEXT:    movl %eax, (%esi)
-; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -12(%ebp), %esp
-; WIN-X86-NEXT:    popl %esi
-; WIN-X86-NEXT:    popl %edi
-; WIN-X86-NEXT:    popl %ebx
+; WIN-X86-NEXT:    movl %edx, 4(%eax)
+; WIN-X86-NEXT:    movl %ecx, (%eax)
+; WIN-X86-NEXT:    movl %ebp, %esp
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call { fp128, fp128 } @llvm.modf.f128(fp128 %a)
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 6abdf9a5ba652..a004842dcaa4d 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -35,10 +35,8 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_ceil:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -47,9 +45,9 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.ceil.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -87,13 +85,13 @@ define void @test_half_copysign(half %a0, half %a1, ptr %p0) nounwind {
 ; X86-LABEL: test_half_copysign:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    por %xmm1, %xmm0
-; X86-NEXT:    pextrw $0, %xmm0, %ecx
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    por %xmm0, %xmm1
+; X86-NEXT:    pextrw $0, %xmm1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    retl
   %res = call half @llvm.copysign.half(half %a0, half %a1)
   store half %res, ptr %p0, align 2
@@ -137,10 +135,8 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_cos:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -149,9 +145,9 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.cos.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -195,10 +191,8 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_exp:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -207,9 +201,9 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.exp.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -253,10 +247,8 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_exp2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -265,9 +257,9 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.exp2.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -311,10 +303,8 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_exp10:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -323,9 +313,9 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.exp10.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -355,10 +345,10 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_fabs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    retl
   %res = call half @llvm.fabs.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -394,10 +384,8 @@ define void @test_half_floor(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_floor:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -406,9 +394,9 @@ define void @test_half_floor(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.floor.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -472,24 +460,18 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_fma:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $88, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    subl $60, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -502,9 +484,9 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; X86-NEXT:    fstpl (%esp)
 ; X86-NEXT:    calll __truncdfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $88, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.fma.half(half %a0, half %a1, half %a2)
   store half %res, ptr %p0, align 2
@@ -534,10 +516,10 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_fneg:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    retl
   %res = fneg half %a0
   store half %res, ptr %p0, align 2
@@ -581,10 +563,8 @@ define void @test_half_log(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_log:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -593,9 +573,9 @@ define void @test_half_log(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.log.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -639,10 +619,8 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_log2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -651,9 +629,9 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.log2.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -697,10 +675,8 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_log10:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -709,9 +685,9 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.log10.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -747,10 +723,8 @@ define void @test_half_nearbyint(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_nearbyint:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -759,9 +733,9 @@ define void @test_half_nearbyint(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.nearbyint.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -817,17 +791,13 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_pow:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    subl $28, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -838,9 +808,9 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $56, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.pow.half(half %a0, half %a1)
   store half %res, ptr %p0, align 2
@@ -890,25 +860,21 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_powi:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __powisf2
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $20, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.powi.half(half %a0, i32 %a1)
   store half %res, ptr %p0, align 2
@@ -944,10 +910,8 @@ define void @test_half_rint(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_rint:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -956,9 +920,9 @@ define void @test_half_rint(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.rint.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -1002,10 +966,8 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_sin:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -1014,9 +976,9 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.sin.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -1052,10 +1014,8 @@ define void @test_half_sqrt(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_sqrt:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -1065,9 +1025,9 @@ define void @test_half_sqrt(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.sqrt.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -1111,10 +1071,8 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_tan:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -1123,9 +1081,9 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.tan.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -1161,10 +1119,8 @@ define void @test_half_trunc(half %a0, ptr %p0) nounwind {
 ;
 ; X86-LABEL: test_half_trunc:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -1173,9 +1129,9 @@ define void @test_half_trunc(half %a0, ptr %p0) nounwind {
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %res = call half @llvm.trunc.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -1357,15 +1313,13 @@ define half @test_half_atan2(half %a, half %b) nounwind {
 ;
 ; X86-LABEL: test_half_atan2:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $60, %esp
-; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    subl $28, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; X86-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
@@ -1375,7 +1329,7 @@ define half @test_half_atan2(half %a, half %b) nounwind {
 ; X86-NEXT:    calll atan2f
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll __truncsfhf2
-; X86-NEXT:    addl $60, %esp
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    retl
   %x = call half @llvm.atan2.f16(half %a, half %b)
   ret half %x
diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
index 8bbc6247dbafd..7c6cfafa922e9 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
@@ -7,10 +7,10 @@ define x86_fp80 @fma(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $36, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll fmal
@@ -40,8 +40,8 @@ define x86_fp80 @frem(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll fmodl
@@ -319,8 +319,8 @@ define x86_fp80 @maxnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll fmaxl
@@ -348,8 +348,8 @@ define x86_fp80 @minnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll fminl
@@ -402,8 +402,8 @@ define x86_fp80 @pow(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll powl
@@ -430,10 +430,9 @@ define x86_fp80 @powi(x86_fp80 %x, i32 %y) nounwind strictfp {
 ; X86-LABEL: powi:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    wait
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll __powixf2
@@ -634,8 +633,8 @@ define x86_fp80 @atan2(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    calll atan2l
diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
index cb2361fbb8d37..a63baf81cf9f4 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar-cmp.ll
@@ -10,16 +10,16 @@ define i32 @test_oeq_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X86-NEXT:    fucompp
 ; X86-NEXT:    wait
 ; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    jne .LBB0_3
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    jp .LBB0_3
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:  .LBB0_3:
-; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_oeq_q:
@@ -455,16 +455,16 @@ define i32 @test_une_q(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X86-NEXT:    fucompp
 ; X86-NEXT:    wait
 ; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    jne .LBB12_3
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    jp .LBB12_3
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:  .LBB12_3:
-; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_une_q:
@@ -530,16 +530,16 @@ define i32 @test_oeq_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X86-NEXT:    fcompp
 ; X86-NEXT:    wait
 ; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    jne .LBB14_3
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    jp .LBB14_3
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:  .LBB14_3:
-; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_oeq_s:
@@ -975,16 +975,16 @@ define i32 @test_une_s(i32 %a, i32 %b, x86_fp80 %f1, x86_fp80 %f2) #0 {
 ; X86-NEXT:    fcompp
 ; X86-NEXT:    wait
 ; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    jne .LBB26_3
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    jp .LBB26_3
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:  .LBB26_3:
-; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl (%ecx), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_une_s:
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 9b7a43a29a942..efda32d15c1ad 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -37,9 +37,9 @@ define i1 @test_signed_i1_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    jb .LBB0_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -110,9 +110,9 @@ define i8 @test_signed_i8_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    jb .LBB1_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -122,9 +122,9 @@ define i8 @test_signed_i8_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    ja .LBB1_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %edx, %ecx
@@ -181,9 +181,9 @@ define i13 @test_signed_i13_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    jb .LBB2_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -193,9 +193,9 @@ define i13 @test_signed_i13_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    ja .LBB2_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -253,9 +253,9 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    jb .LBB3_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -265,9 +265,9 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    ja .LBB3_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -325,9 +325,9 @@ define i19 @test_signed_i19_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    jb .LBB4_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -337,9 +337,9 @@ define i19 @test_signed_i19_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB4_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -398,9 +398,9 @@ define i32 @test_signed_i32_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    jb .LBB5_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -410,9 +410,9 @@ define i32 @test_signed_i32_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB5_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -434,8 +434,8 @@ define i32 @test_signed_i32_f32(float %f) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %ecx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
@@ -494,9 +494,9 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    ja .LBB6_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -542,8 +542,8 @@ define i50 @test_signed_i50_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $131071, %edx # imm = 0x1FFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -609,9 +609,9 @@ define i64 @test_signed_i64_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB7_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -657,8 +657,8 @@ define i64 @test_signed_i64_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -703,22 +703,23 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixsfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %ecx, %ecx
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-8, %ebx
+; X86-X87-NEXT:    movl $0, %eax
 ; X86-X87-NEXT:    jb .LBB8_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:  .LBB8_2:
-; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    movl $0, %ebp
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB8_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB8_4:
-; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-8, %edi
 ; X86-X87-NEXT:    jb .LBB8_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -728,44 +729,52 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $7, %esi
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
-; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB8_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %ecx, %esi
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB8_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $7, %edi
+; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-1, %ebp
+; X86-X87-NEXT:    movl $-1, %edi
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB8_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
+; X86-X87-NEXT:    movl %ebx, %ebp
+; X86-X87-NEXT:    movl %edx, %edi
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-X87-NEXT:  .LBB8_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %ebp
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jp .LBB8_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %edi, %edx
-; X86-X87-NEXT:    movl %esi, %eax
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB8_12:
-; X86-X87-NEXT:    movl %ebx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
-; X86-X87-NEXT:    movb %dl, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB8_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %edi, %edx
+; X86-X87-NEXT:  .LBB8_14:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB8_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB8_16:
+; X86-X87-NEXT:    movl %edx, (%eax)
+; X86-X87-NEXT:    jp .LBB8_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB8_18:
+; X86-X87-NEXT:    andl $15, %ecx
+; X86-X87-NEXT:    movb %cl, 12(%eax)
 ; X86-X87-NEXT:    addl $60, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -775,55 +784,49 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i100_f32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixsfti
 ; X86-SSE-NEXT:    subl $4, %esp
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorl %ebp, %ebp
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $-8, %ebx
-; X86-SSE-NEXT:    movl $0, %ecx
-; X86-SSE-NEXT:    movl $0, %edx
-; X86-SSE-NEXT:    movl $0, %edi
-; X86-SSE-NEXT:    jb .LBB8_2
-; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    cmovbl %ecx, %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:  .LBB8_2:
+; X86-SSE-NEXT:    cmovbl %ecx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    cmovbl %ecx, %esi
+; X86-SSE-NEXT:    movl $-8, %eax
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $7, %edx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    cmoval %eax, %esi
 ; X86-SSE-NEXT:    cmoval %eax, %edi
-; X86-SSE-NEXT:    cmoval %eax, %edx
-; X86-SSE-NEXT:    cmoval %eax, %ecx
-; X86-SSE-NEXT:    movl $7, %eax
-; X86-SSE-NEXT:    cmovbel %ebx, %eax
+; X86-SSE-NEXT:    cmoval %eax, %ebx
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ebp, %eax
-; X86-SSE-NEXT:    cmovpl %ebp, %ecx
-; X86-SSE-NEXT:    cmovpl %ebp, %edx
-; X86-SSE-NEXT:    cmovpl %ebp, %edi
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
-; X86-SSE-NEXT:    movb %al, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, (%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i100_f32:
@@ -859,7 +862,7 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fsts {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -872,129 +875,129 @@ define i128 @test_signed_i128_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixsfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-X87-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    jb .LBB9_2
+; X86-X87-NEXT:  # %bb.1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:  .LBB9_2:
 ; X86-X87-NEXT:    movl $0, %ebx
-; X86-X87-NEXT:    jae .LBB9_1
-; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jae .LBB9_3
+; X86-X87-NEXT:    jb .LBB9_4
+; X86-X87-NEXT:  # %bb.3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-X87-NEXT:  .LBB9_4:
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jb .LBB9_6
-; X86-X87-NEXT:  .LBB9_5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:  # %bb.5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB9_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-X87-NEXT:    movl $-1, %ecx
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB9_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl %edi, %ecx
+; X86-X87-NEXT:    movl %ebx, %ebp
 ; X86-X87-NEXT:  .LBB9_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB9_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB9_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %esi
 ; X86-X87-NEXT:    jp .LBB9_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, %edx
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %edx, %esi
 ; X86-X87-NEXT:  .LBB9_12:
-; X86-X87-NEXT:    movl %ebx, 12(%ecx)
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %eax, 4(%ecx)
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %esi, 12(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB9_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB9_14:
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB9_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB9_16:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %ecx
+; X86-X87-NEXT:    jp .LBB9_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB9_18:
+; X86-X87-NEXT:    movl %ecx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
-; X86-X87-NEXT:  .LBB9_1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jb .LBB9_4
-; X86-X87-NEXT:  .LBB9_3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-X87-NEXT:    jae .LBB9_5
-; X86-X87-NEXT:    jmp .LBB9_6
 ;
 ; X86-SSE-LABEL: test_signed_i128_f32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixsfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movl $0, %edi
+; X86-SSE-NEXT:    movl $0, %esi
+; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    jb .LBB9_2
+; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    cmovbl %ecx, %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:  .LBB9_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    cmoval %ebx, %edi
 ; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebp, %ebx
-; X86-SSE-NEXT:    movl $-1, %ebp
-; X86-SSE-NEXT:    cmoval %ebp, %edi
-; X86-SSE-NEXT:    cmoval %ebp, %edx
-; X86-SSE-NEXT:    cmoval %ebp, %eax
+; X86-SSE-NEXT:    cmovbel %eax, %ebx
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ecx, %eax
-; X86-SSE-NEXT:    cmovpl %ecx, %edx
-; X86-SSE-NEXT:    cmovpl %ecx, %edi
 ; X86-SSE-NEXT:    cmovpl %ecx, %ebx
-; X86-SSE-NEXT:    movl %ebx, 12(%esi)
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %eax, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i128_f32:
@@ -1057,9 +1060,9 @@ define i1 @test_signed_i1_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    jb .LBB10_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -1130,9 +1133,9 @@ define i8 @test_signed_i8_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    jb .LBB11_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -1142,9 +1145,9 @@ define i8 @test_signed_i8_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    ja .LBB11_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %edx, %ecx
@@ -1201,9 +1204,9 @@ define i13 @test_signed_i13_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    jb .LBB12_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -1213,9 +1216,9 @@ define i13 @test_signed_i13_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    ja .LBB12_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -1273,9 +1276,9 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    jb .LBB13_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -1285,9 +1288,9 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    ja .LBB13_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -1345,9 +1348,9 @@ define i19 @test_signed_i19_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    jb .LBB14_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1357,9 +1360,9 @@ define i19 @test_signed_i19_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB14_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -1418,9 +1421,9 @@ define i32 @test_signed_i32_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    jb .LBB15_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1430,9 +1433,9 @@ define i32 @test_signed_i32_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB15_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -1512,9 +1515,9 @@ define i50 @test_signed_i50_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    ja .LBB16_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -1618,9 +1621,9 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB17_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -1666,8 +1669,8 @@ define i64 @test_signed_i64_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -1712,22 +1715,23 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixdfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %ecx, %ecx
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-8, %ebx
+; X86-X87-NEXT:    movl $0, %eax
 ; X86-X87-NEXT:    jb .LBB18_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:  .LBB18_2:
-; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    movl $0, %ebp
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB18_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB18_4:
-; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-8, %edi
 ; X86-X87-NEXT:    jb .LBB18_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1737,44 +1741,52 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $7, %esi
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
-; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB18_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %ecx, %esi
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB18_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $7, %edi
+; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-1, %ebp
+; X86-X87-NEXT:    movl $-1, %edi
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB18_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
+; X86-X87-NEXT:    movl %ebx, %ebp
+; X86-X87-NEXT:    movl %edx, %edi
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-X87-NEXT:  .LBB18_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %ebp
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jp .LBB18_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %edi, %edx
-; X86-X87-NEXT:    movl %esi, %eax
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB18_12:
-; X86-X87-NEXT:    movl %ebx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
-; X86-X87-NEXT:    movb %dl, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB18_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %edi, %edx
+; X86-X87-NEXT:  .LBB18_14:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB18_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB18_16:
+; X86-X87-NEXT:    movl %edx, (%eax)
+; X86-X87-NEXT:    jp .LBB18_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB18_18:
+; X86-X87-NEXT:    andl $15, %ecx
+; X86-X87-NEXT:    movb %cl, 12(%eax)
 ; X86-X87-NEXT:    addl $60, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -1784,55 +1796,49 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i100_f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixdfti
 ; X86-SSE-NEXT:    subl $4, %esp
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    xorl %ebp, %ebp
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $-8, %ebx
-; X86-SSE-NEXT:    movl $0, %ecx
-; X86-SSE-NEXT:    movl $0, %edx
-; X86-SSE-NEXT:    movl $0, %edi
-; X86-SSE-NEXT:    jb .LBB18_2
-; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    cmovbl %ecx, %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:  .LBB18_2:
+; X86-SSE-NEXT:    cmovbl %ecx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    cmovbl %ecx, %esi
+; X86-SSE-NEXT:    movl $-8, %eax
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $7, %edx
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    cmoval %eax, %esi
 ; X86-SSE-NEXT:    cmoval %eax, %edi
-; X86-SSE-NEXT:    cmoval %eax, %edx
-; X86-SSE-NEXT:    cmoval %eax, %ecx
-; X86-SSE-NEXT:    movl $7, %eax
-; X86-SSE-NEXT:    cmovbel %ebx, %eax
+; X86-SSE-NEXT:    cmoval %eax, %ebx
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ebp, %eax
-; X86-SSE-NEXT:    cmovpl %ebp, %ecx
-; X86-SSE-NEXT:    cmovpl %ebp, %edx
-; X86-SSE-NEXT:    cmovpl %ebp, %edi
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
-; X86-SSE-NEXT:    movb %al, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, (%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i100_f64:
@@ -1868,7 +1874,7 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fstl {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1881,129 +1887,129 @@ define i128 @test_signed_i128_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixdfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-X87-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    jb .LBB19_2
+; X86-X87-NEXT:  # %bb.1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:  .LBB19_2:
 ; X86-X87-NEXT:    movl $0, %ebx
-; X86-X87-NEXT:    jae .LBB19_1
-; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jae .LBB19_3
+; X86-X87-NEXT:    jb .LBB19_4
+; X86-X87-NEXT:  # %bb.3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-X87-NEXT:  .LBB19_4:
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jb .LBB19_6
-; X86-X87-NEXT:  .LBB19_5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:  # %bb.5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB19_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-X87-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-X87-NEXT:    movl $-1, %ecx
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB19_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl %edi, %ecx
+; X86-X87-NEXT:    movl %ebx, %ebp
 ; X86-X87-NEXT:  .LBB19_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB19_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB19_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %esi
 ; X86-X87-NEXT:    jp .LBB19_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, %edx
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %edx, %esi
 ; X86-X87-NEXT:  .LBB19_12:
-; X86-X87-NEXT:    movl %ebx, 12(%ecx)
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %eax, 4(%ecx)
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %esi, 12(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB19_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB19_14:
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB19_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB19_16:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %ecx
+; X86-X87-NEXT:    jp .LBB19_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB19_18:
+; X86-X87-NEXT:    movl %ecx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
-; X86-X87-NEXT:  .LBB19_1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jb .LBB19_4
-; X86-X87-NEXT:  .LBB19_3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-X87-NEXT:    jae .LBB19_5
-; X86-X87-NEXT:    jmp .LBB19_6
 ;
 ; X86-SSE-LABEL: test_signed_i128_f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixdfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movl $0, %edi
+; X86-SSE-NEXT:    movl $0, %esi
+; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    jb .LBB19_2
+; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    cmovbl %ecx, %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:  .LBB19_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    cmoval %ebx, %edi
 ; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebp, %ebx
-; X86-SSE-NEXT:    movl $-1, %ebp
-; X86-SSE-NEXT:    cmoval %ebp, %edi
-; X86-SSE-NEXT:    cmoval %ebp, %edx
-; X86-SSE-NEXT:    cmoval %ebp, %eax
+; X86-SSE-NEXT:    cmovbel %eax, %ebx
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ecx, %eax
-; X86-SSE-NEXT:    cmovpl %ecx, %edx
-; X86-SSE-NEXT:    cmovpl %ecx, %edi
 ; X86-SSE-NEXT:    cmovpl %ecx, %ebx
-; X86-SSE-NEXT:    movl %ebx, 12(%esi)
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %eax, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i128_f64:
@@ -2055,6 +2061,9 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X86-X87-NEXT:    subl $24, %esp
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:    movl %eax, (%esp)
+; X86-X87-NEXT:    fld1
+; X86-X87-NEXT:    fchs
+; X86-X87-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    calll __extendhfsf2
 ; X86-X87-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -2063,15 +2072,14 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-X87-NEXT:    fld1
-; X86-X87-NEXT:    fchs
+; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    jb .LBB20_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -2160,9 +2168,9 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    jb .LBB21_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -2172,9 +2180,9 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    ja .LBB21_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %edx, %ecx
@@ -2247,9 +2255,9 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    jb .LBB22_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -2259,9 +2267,9 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    ja .LBB22_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -2335,9 +2343,9 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    jb .LBB23_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -2347,9 +2355,9 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    ja .LBB23_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -2423,9 +2431,9 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    jb .LBB24_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -2435,9 +2443,9 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB24_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -2508,9 +2516,9 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    jb .LBB25_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -2520,9 +2528,9 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB25_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -2550,11 +2558,11 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmovael %eax, %ecx
 ; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %ecx, %edx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
@@ -2622,9 +2630,9 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    ja .LBB26_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -2675,8 +2683,8 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $131071, %edx # imm = 0x1FFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -2747,9 +2755,9 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB27_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -2800,8 +2808,8 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -2854,22 +2862,23 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixsfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %ecx, %ecx
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-8, %ebx
+; X86-X87-NEXT:    movl $0, %eax
 ; X86-X87-NEXT:    jb .LBB28_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:  .LBB28_2:
-; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    movl $0, %ebp
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB28_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB28_4:
-; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-8, %edi
 ; X86-X87-NEXT:    jb .LBB28_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2879,44 +2888,52 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $7, %esi
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
-; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB28_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %ecx, %esi
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB28_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $7, %edi
+; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-1, %ebp
+; X86-X87-NEXT:    movl $-1, %edi
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB28_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
+; X86-X87-NEXT:    movl %ebx, %ebp
+; X86-X87-NEXT:    movl %edx, %edi
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-X87-NEXT:  .LBB28_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %ebp
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jp .LBB28_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %edi, %edx
-; X86-X87-NEXT:    movl %esi, %eax
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB28_12:
-; X86-X87-NEXT:    movl %ebx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
-; X86-X87-NEXT:    movb %dl, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB28_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %edi, %edx
+; X86-X87-NEXT:  .LBB28_14:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB28_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB28_16:
+; X86-X87-NEXT:    movl %edx, (%eax)
+; X86-X87-NEXT:    jp .LBB28_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB28_18:
+; X86-X87-NEXT:    andl $15, %ecx
+; X86-X87-NEXT:    movb %cl, 12(%eax)
 ; X86-X87-NEXT:    addl $60, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -2926,13 +2943,11 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i100_f16:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
@@ -2944,44 +2959,40 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    calll __fixsfti
 ; X86-SSE-NEXT:    subl $4, %esp
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorl %ebp, %ebp
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $-8, %ebx
-; X86-SSE-NEXT:    movl $0, %ecx
-; X86-SSE-NEXT:    movl $0, %edx
-; X86-SSE-NEXT:    movl $0, %edi
-; X86-SSE-NEXT:    jb .LBB28_2
-; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    cmovbl %ecx, %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:  .LBB28_2:
+; X86-SSE-NEXT:    cmovbl %ecx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    cmovbl %ecx, %esi
+; X86-SSE-NEXT:    movl $-8, %eax
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $7, %edx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    cmoval %eax, %esi
 ; X86-SSE-NEXT:    cmoval %eax, %edi
-; X86-SSE-NEXT:    cmoval %eax, %edx
-; X86-SSE-NEXT:    cmoval %eax, %ecx
-; X86-SSE-NEXT:    movl $7, %eax
-; X86-SSE-NEXT:    cmovbel %ebx, %eax
+; X86-SSE-NEXT:    cmoval %eax, %ebx
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ebp, %eax
-; X86-SSE-NEXT:    cmovpl %ebp, %ecx
-; X86-SSE-NEXT:    cmovpl %ebp, %edx
-; X86-SSE-NEXT:    cmovpl %ebp, %edi
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
-; X86-SSE-NEXT:    movb %al, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, (%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i100_f16:
@@ -3018,7 +3029,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:    movl %eax, (%esp)
 ; X86-X87-NEXT:    calll __extendhfsf2
@@ -3033,91 +3044,89 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixsfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-X87-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    jb .LBB29_2
+; X86-X87-NEXT:  # %bb.1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:  .LBB29_2:
 ; X86-X87-NEXT:    movl $0, %ebx
-; X86-X87-NEXT:    jae .LBB29_1
-; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jae .LBB29_3
+; X86-X87-NEXT:    jb .LBB29_4
+; X86-X87-NEXT:  # %bb.3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-X87-NEXT:  .LBB29_4:
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jb .LBB29_6
-; X86-X87-NEXT:  .LBB29_5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:  # %bb.5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB29_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-X87-NEXT:    movl $-1, %ecx
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB29_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl %edi, %ecx
+; X86-X87-NEXT:    movl %ebx, %ebp
 ; X86-X87-NEXT:  .LBB29_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB29_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB29_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %esi
 ; X86-X87-NEXT:    jp .LBB29_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, %edx
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %edx, %esi
 ; X86-X87-NEXT:  .LBB29_12:
-; X86-X87-NEXT:    movl %ebx, 12(%ecx)
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %eax, 4(%ecx)
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %esi, 12(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB29_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB29_14:
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB29_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB29_16:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %ecx
+; X86-X87-NEXT:    jp .LBB29_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB29_18:
+; X86-X87-NEXT:    movl %ecx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
-; X86-X87-NEXT:  .LBB29_1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jb .LBB29_4
-; X86-X87-NEXT:  .LBB29_3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-X87-NEXT:    jae .LBB29_5
-; X86-X87-NEXT:    jmp .LBB29_6
 ;
 ; X86-SSE-LABEL: test_signed_i128_f16:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
+; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
@@ -3129,40 +3138,42 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    calll __fixsfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movl $0, %edi
+; X86-SSE-NEXT:    movl $0, %esi
+; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    jb .LBB29_2
+; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    cmovbl %ecx, %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:  .LBB29_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    cmoval %ebx, %edi
 ; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebp, %ebx
-; X86-SSE-NEXT:    movl $-1, %ebp
-; X86-SSE-NEXT:    cmoval %ebp, %edi
-; X86-SSE-NEXT:    cmoval %ebp, %edx
-; X86-SSE-NEXT:    cmoval %ebp, %eax
+; X86-SSE-NEXT:    cmovbel %eax, %ebx
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm0
-; X86-SSE-NEXT:    cmovpl %ecx, %eax
-; X86-SSE-NEXT:    cmovpl %ecx, %edx
-; X86-SSE-NEXT:    cmovpl %ecx, %edi
 ; X86-SSE-NEXT:    cmovpl %ecx, %ebx
-; X86-SSE-NEXT:    movl %ebx, 12(%esi)
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %eax, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i128_f16:
@@ -3226,9 +3237,9 @@ define i1 @test_signed_i1_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-1, %dl
 ; X86-X87-NEXT:    jb .LBB30_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -3270,14 +3281,14 @@ define i1 @test_signed_i1_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    fld1
 ; X86-SSE-NEXT:    fchs
+; X86-SSE-NEXT:    movl $255, %ecx
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $255, %eax
-; X86-SSE-NEXT:    cmovael %ecx, %eax
+; X86-SSE-NEXT:    cmovbl %ecx, %eax
 ; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    fldz
 ; X86-SSE-NEXT:    fxch %st(1)
@@ -3339,9 +3350,9 @@ define i8 @test_signed_i8_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $-128, %dl
 ; X86-X87-NEXT:    jb .LBB31_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
@@ -3351,9 +3362,9 @@ define i8 @test_signed_i8_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movb $127, %cl
 ; X86-X87-NEXT:    ja .LBB31_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %edx, %ecx
@@ -3381,18 +3392,18 @@ define i8 @test_signed_i8_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $128, %eax
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $128, %ecx
-; X86-SSE-NEXT:    cmovael %eax, %ecx
+; X86-SSE-NEXT:    cmovbl %eax, %ecx
+; X86-SSE-NEXT:    movl $127, %edx
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $127, %edx
 ; X86-SSE-NEXT:    cmovbel %ecx, %edx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
@@ -3450,9 +3461,9 @@ define i13 @test_signed_i13_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $61440, %ecx # imm = 0xF000
 ; X86-X87-NEXT:    jb .LBB32_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -3462,9 +3473,9 @@ define i13 @test_signed_i13_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-X87-NEXT:    ja .LBB32_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -3493,18 +3504,18 @@ define i13 @test_signed_i13_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $61440, %eax # imm = 0xF000
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $61440, %ecx # imm = 0xF000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
+; X86-SSE-NEXT:    cmovbl %eax, %ecx
+; X86-SSE-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $4095, %edx # imm = 0xFFF
 ; X86-SSE-NEXT:    cmovbel %ecx, %edx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
@@ -3562,9 +3573,9 @@ define i16 @test_signed_i16_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X86-X87-NEXT:    jb .LBB33_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
@@ -3574,9 +3585,9 @@ define i16 @test_signed_i16_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-X87-NEXT:    ja .LBB33_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -3605,18 +3616,18 @@ define i16 @test_signed_i16_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $32768, %eax # imm = 0x8000
+; X86-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $32768, %ecx # imm = 0x8000
-; X86-SSE-NEXT:    cmovael %eax, %ecx
+; X86-SSE-NEXT:    cmovbl %eax, %ecx
+; X86-SSE-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-SSE-NEXT:    cmovbel %ecx, %edx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
@@ -3674,9 +3685,9 @@ define i19 @test_signed_i19_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-262144, %ecx # imm = 0xFFFC0000
 ; X86-X87-NEXT:    jb .LBB34_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -3686,9 +3697,9 @@ define i19 @test_signed_i19_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB34_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -3717,20 +3728,20 @@ define i19 @test_signed_i19_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fistl {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw (%esp)
+; X86-SSE-NEXT:    movl $-262144, %eax # imm = 0xFFFC0000
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $-262144, %eax # imm = 0xFFFC0000
 ; X86-SSE-NEXT:    jb .LBB34_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:  .LBB34_2:
+; X86-SSE-NEXT:    movl $262143, %ecx # imm = 0x3FFFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $262143, %ecx # imm = 0x3FFFF
 ; X86-SSE-NEXT:    cmovbel %eax, %ecx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
@@ -3788,9 +3799,9 @@ define i32 @test_signed_i32_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
 ; X86-X87-NEXT:    jb .LBB35_2
 ; X86-X87-NEXT:  # %bb.1:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -3800,9 +3811,9 @@ define i32 @test_signed_i32_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB35_4
 ; X86-X87-NEXT:  # %bb.3:
 ; X86-X87-NEXT:    movl %ecx, %edx
@@ -3831,20 +3842,20 @@ define i32 @test_signed_i32_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fistl {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw (%esp)
+; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    jb .LBB35_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:  .LBB35_2:
+; X86-SSE-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
 ; X86-SSE-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
 ; X86-SSE-NEXT:    cmovbel %eax, %ecx
 ; X86-SSE-NEXT:    xorl %eax, %eax
 ; X86-SSE-NEXT:    fucompi %st(0), %st
@@ -3923,9 +3934,9 @@ define i50 @test_signed_i50_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $131071, %esi # imm = 0x1FFFF
 ; X86-X87-NEXT:    ja .LBB36_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -3973,11 +3984,11 @@ define i50 @test_signed_i50_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-131072, %eax # imm = 0xFFFE0000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $131071, %edx # imm = 0x1FFFF
 ; X86-SSE-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $131071, %edx # imm = 0x1FFFF
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -4062,9 +4073,9 @@ define i64 @test_signed_i64_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB37_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl %edi, %esi
@@ -4112,11 +4123,11 @@ define i64 @test_signed_i64_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    cmovbl %ecx, %esi
 ; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -4185,22 +4196,23 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixxfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %ecx, %ecx
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-8, %ebx
+; X86-X87-NEXT:    movl $0, %eax
 ; X86-X87-NEXT:    jb .LBB38_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:  .LBB38_2:
-; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    movl $0, %ebp
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB38_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB38_4:
-; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-8, %edi
 ; X86-X87-NEXT:    jb .LBB38_6
 ; X86-X87-NEXT:  # %bb.5:
 ; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -4210,44 +4222,52 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $7, %esi
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
-; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB38_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %ecx, %esi
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB38_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $7, %edi
+; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl $-1, %ebp
+; X86-X87-NEXT:    movl $-1, %edi
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB38_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
+; X86-X87-NEXT:    movl %ebx, %ebp
+; X86-X87-NEXT:    movl %edx, %edi
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-X87-NEXT:  .LBB38_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %ebp
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jp .LBB38_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %edi, %edx
-; X86-X87-NEXT:    movl %esi, %eax
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB38_12:
-; X86-X87-NEXT:    movl %ebx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
-; X86-X87-NEXT:    movb %dl, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB38_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %edi, %edx
+; X86-X87-NEXT:  .LBB38_14:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB38_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB38_16:
+; X86-X87-NEXT:    movl %edx, (%eax)
+; X86-X87-NEXT:    jp .LBB38_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB38_18:
+; X86-X87-NEXT:    andl $15, %ecx
+; X86-X87-NEXT:    movb %cl, 12(%eax)
 ; X86-X87-NEXT:    addl $60, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -4257,12 +4277,10 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i100_f80:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $48, %esp
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fld %st(0)
 ; X86-SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
@@ -4271,49 +4289,44 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixxfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-SSE-NEXT:    xorl %ebp, %ebp
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; X86-SSE-NEXT:    fxch %st(1)
+; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $-8, %ebx
-; X86-SSE-NEXT:    movl $0, %ecx
-; X86-SSE-NEXT:    movl $0, %edx
-; X86-SSE-NEXT:    movl $0, %edi
-; X86-SSE-NEXT:    jb .LBB38_2
-; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    cmovbl %ecx, %ebx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:  .LBB38_2:
+; X86-SSE-NEXT:    cmovbl %ecx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    cmovbl %ecx, %esi
+; X86-SSE-NEXT:    movl $-8, %eax
+; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl $7, %edx
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
+; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    cmoval %eax, %esi
 ; X86-SSE-NEXT:    cmoval %eax, %edi
-; X86-SSE-NEXT:    cmoval %eax, %edx
-; X86-SSE-NEXT:    cmoval %eax, %ecx
-; X86-SSE-NEXT:    movl $7, %eax
-; X86-SSE-NEXT:    cmovbel %ebx, %eax
+; X86-SSE-NEXT:    cmoval %eax, %ebx
 ; X86-SSE-NEXT:    fucompi %st(0), %st
-; X86-SSE-NEXT:    cmovpl %ebp, %eax
-; X86-SSE-NEXT:    cmovpl %ebp, %ecx
-; X86-SSE-NEXT:    cmovpl %ebp, %edx
-; X86-SSE-NEXT:    cmovpl %ebp, %edi
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
-; X86-SSE-NEXT:    movb %al, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    cmovpl %ecx, %ebx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, (%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
+; X86-SSE-NEXT:    addl $48, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i100_f80:
@@ -4371,90 +4384,88 @@ define i128 @test_signed_i128_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixxfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-X87-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    jb .LBB39_2
+; X86-X87-NEXT:  # %bb.1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:  .LBB39_2:
 ; X86-X87-NEXT:    movl $0, %ebx
-; X86-X87-NEXT:    jae .LBB39_1
-; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jae .LBB39_3
+; X86-X87-NEXT:    jb .LBB39_4
+; X86-X87-NEXT:  # %bb.3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-X87-NEXT:  .LBB39_4:
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    movl $0, %edx
 ; X86-X87-NEXT:    jb .LBB39_6
-; X86-X87-NEXT:  .LBB39_5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:  # %bb.5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB39_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-X87-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-X87-NEXT:    movl $-1, %ecx
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB39_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-X87-NEXT:    movl %edi, %ecx
+; X86-X87-NEXT:    movl %ebx, %ebp
 ; X86-X87-NEXT:  .LBB39_8:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-X87-NEXT:    ja .LBB39_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB39_10:
 ; X86-X87-NEXT:    fucomp %st(0)
 ; X86-X87-NEXT:    fnstsw %ax
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %esi
 ; X86-X87-NEXT:    jp .LBB39_12
 ; X86-X87-NEXT:  # %bb.11:
-; X86-X87-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %edi, %eax
-; X86-X87-NEXT:    movl %ebp, %edx
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-X87-NEXT:    movl %edx, %esi
 ; X86-X87-NEXT:  .LBB39_12:
-; X86-X87-NEXT:    movl %ebx, 12(%ecx)
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %eax, 4(%ecx)
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %esi, 12(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB39_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ebp, %edx
+; X86-X87-NEXT:  .LBB39_14:
+; X86-X87-NEXT:    movl %edx, 8(%eax)
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jp .LBB39_16
+; X86-X87-NEXT:  # %bb.15:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB39_16:
+; X86-X87-NEXT:    movl %edx, 4(%eax)
+; X86-X87-NEXT:    movl $0, %ecx
+; X86-X87-NEXT:    jp .LBB39_18
+; X86-X87-NEXT:  # %bb.17:
+; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-X87-NEXT:  .LBB39_18:
+; X86-X87-NEXT:    movl %ecx, (%eax)
 ; X86-X87-NEXT:    addl $60, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
-; X86-X87-NEXT:  .LBB39_1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl $0, %edx
-; X86-X87-NEXT:    jb .LBB39_4
-; X86-X87-NEXT:  .LBB39_3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-X87-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-X87-NEXT:    jae .LBB39_5
-; X86-X87-NEXT:    jmp .LBB39_6
 ;
 ; X86-SSE-LABEL: test_signed_i128_f80:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    subl $44, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    subl $48, %esp
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fld %st(0)
 ; X86-SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
@@ -4463,44 +4474,47 @@ define i128 @test_signed_i128_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixxfti
 ; X86-SSE-NEXT:    subl $4, %esp
+; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
+; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
+; X86-SSE-NEXT:    movl $0, %edi
+; X86-SSE-NEXT:    movl $0, %esi
+; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    jb .LBB39_2
+; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    cmovbl %ecx, %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    cmovbl %ecx, %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    cmovbl %ecx, %edi
-; X86-SSE-NEXT:    movl $-2147483648, %ebp # imm = 0x80000000
-; X86-SSE-NEXT:    cmovael {{[0-9]+}}(%esp), %ebp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:  .LBB39_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    cmoval %ebx, %edi
 ; X86-SSE-NEXT:    movl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-SSE-NEXT:    cmovbel %ebp, %ebx
-; X86-SSE-NEXT:    movl $-1, %ebp
-; X86-SSE-NEXT:    cmoval %ebp, %edi
-; X86-SSE-NEXT:    cmoval %ebp, %edx
-; X86-SSE-NEXT:    cmoval %ebp, %eax
+; X86-SSE-NEXT:    cmovbel %eax, %ebx
 ; X86-SSE-NEXT:    fucompi %st(0), %st
-; X86-SSE-NEXT:    cmovpl %ecx, %eax
-; X86-SSE-NEXT:    cmovpl %ecx, %edx
-; X86-SSE-NEXT:    cmovpl %ecx, %edi
 ; X86-SSE-NEXT:    cmovpl %ecx, %ebx
-; X86-SSE-NEXT:    movl %ebx, 12(%esi)
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %eax, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $44, %esp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %ebx, 12(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edi
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmovpl %ecx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    addl $48, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
-; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl $4
 ;
 ; X64-LABEL: test_signed_i128_f80:
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index a074c78d512f5..d0c07b565d61f 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -39,21 +39,22 @@ define i1 @test_unsigned_i1_f32(float %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB0_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB0_3
 ; X86-X87-NEXT:  .LBB0_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB0_3:
 ; X86-X87-NEXT:    fld1
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB0_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB0_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -102,21 +103,22 @@ define i8 @test_unsigned_i8_f32(float %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB1_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB1_3
 ; X86-X87-NEXT:  .LBB1_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB1_3:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $-1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB1_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB1_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -161,25 +163,25 @@ define i13 @test_unsigned_i13_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB2_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB2_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $8191, %ecx # imm = 0x1FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-X87-NEXT:    ja .LBB2_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB2_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -223,25 +225,25 @@ define i16 @test_unsigned_i16_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB3_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB3_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-X87-NEXT:    ja .LBB3_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB3_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -286,34 +288,35 @@ define i19 @test_unsigned_i19_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB4_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB4_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $524287, %ecx # imm = 0x7FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-X87-NEXT:    ja .LBB4_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB4_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: test_unsigned_i19_f32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_unsigned_i19_f32:
@@ -345,24 +348,25 @@ define i32 @test_unsigned_i32_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB5_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB5_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB5_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB5_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -381,8 +385,8 @@ define i32 @test_unsigned_i32_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT:    cmovael %edx, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    retl
 ;
@@ -453,18 +457,19 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB6_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB6_8:
 ; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB6_10
 ; X86-X87-NEXT:  # %bb.9:
 ; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB6_10:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $16, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    retl
@@ -473,10 +478,10 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $16, %esp
+; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm2
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    jbe .LBB6_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    xorps %xmm2, %xmm2
@@ -503,8 +508,8 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:  .LBB6_4:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $262143, %edx # imm = 0x3FFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -580,15 +585,16 @@ define i64 @test_unsigned_i64_f32(float %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %edx
 ; X86-X87-NEXT:    ja .LBB7_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %esi, %eax
+; X86-X87-NEXT:    movl %esi, %ecx
 ; X86-X87-NEXT:    movl %edi, %edx
 ; X86-X87-NEXT:  .LBB7_8:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -597,10 +603,10 @@ define i64 @test_unsigned_i64_f32(float %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i64_f32:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $20, %esp
+; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm2
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    jbe .LBB7_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    xorps %xmm2, %xmm2
@@ -627,8 +633,8 @@ define i64 @test_unsigned_i64_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:  .LBB7_4:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ecx
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ecx, %edx
 ; X86-SSE-NEXT:    cmoval %ecx, %eax
 ; X86-SSE-NEXT:    addl $20, %esp
@@ -676,53 +682,57 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunssfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edi, %edi
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    movl $0, %ecx
 ; X86-X87-NEXT:    jb .LBB8_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:  .LBB8_2:
-; X86-X87-NEXT:    movl $0, %esi
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %edi
 ; X86-X87-NEXT:    jb .LBB8_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB8_4:
 ; X86-X87-NEXT:    jb .LBB8_6
 ; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB8_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $15, %eax
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB8_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB8_8:
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB8_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl %esi, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB8_10:
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
-; X86-X87-NEXT:    movb %al, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB8_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %ebx
+; X86-X87-NEXT:  .LBB8_12:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    movl $15, %edx
+; X86-X87-NEXT:    ja .LBB8_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB8_14:
+; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    movb %dl, 12(%eax)
 ; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -736,41 +746,40 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixunssfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB8_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB8_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $15, %ebx
-; X86-SSE-NEXT:    cmovbel %edi, %ebx
-; X86-SSE-NEXT:    movl $-1, %edi
-; X86-SSE-NEXT:    cmoval %edi, %edx
-; X86-SSE-NEXT:    cmoval %edi, %ecx
-; X86-SSE-NEXT:    cmoval %edi, %eax
-; X86-SSE-NEXT:    movl %eax, 8(%esi)
-; X86-SSE-NEXT:    movl %ecx, 4(%esi)
-; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
-; X86-SSE-NEXT:    movb %bl, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl $15, %edx
+; X86-SSE-NEXT:    cmovbel %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -807,7 +816,7 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fsts {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -820,56 +829,66 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunssfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    jb .LBB9_2
-; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:  .LBB9_2:
 ; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    jb .LBB9_4
-; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jae .LBB9_1
+; X86-X87-NEXT:  # %bb.2:
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jae .LBB9_3
 ; X86-X87-NEXT:  .LBB9_4:
-; X86-X87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB9_6
-; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:  .LBB9_5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB9_6:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB9_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ebx, %eax
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB9_8:
-; X86-X87-NEXT:    movl %esi, 12(%ecx)
-; X86-X87-NEXT:    movl %edi, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 12(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB9_10
+; X86-X87-NEXT:  # %bb.9:
+; X86-X87-NEXT:    movl %edi, %esi
+; X86-X87-NEXT:  .LBB9_10:
+; X86-X87-NEXT:    movl %esi, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB9_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %esi
+; X86-X87-NEXT:  .LBB9_12:
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB9_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %ebx
+; X86-X87-NEXT:  .LBB9_14:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
+; X86-X87-NEXT:  .LBB9_1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jb .LBB9_4
+; X86-X87-NEXT:  .LBB9_3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    jae .LBB9_5
+; X86-X87-NEXT:    jmp .LBB9_6
 ;
 ; X86-SSE-LABEL: test_unsigned_i128_f32:
 ; X86-SSE:       # %bb.0:
@@ -877,39 +896,38 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixunssfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB9_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB9_2:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ebx
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 12(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 8(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, 4(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %ecx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
-; X86-SSE-NEXT:    movl %eax, 12(%esi)
-; X86-SSE-NEXT:    movl %ecx, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %edi, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    movl %ecx, (%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -974,21 +992,22 @@ define i1 @test_unsigned_i1_f64(double %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB10_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB10_3
 ; X86-X87-NEXT:  .LBB10_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB10_3:
 ; X86-X87-NEXT:    fld1
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB10_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB10_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -1037,21 +1056,22 @@ define i8 @test_unsigned_i8_f64(double %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB11_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB11_3
 ; X86-X87-NEXT:  .LBB11_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB11_3:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $-1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB11_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB11_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -1096,25 +1116,25 @@ define i13 @test_unsigned_i13_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB12_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB12_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $8191, %ecx # imm = 0x1FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-X87-NEXT:    ja .LBB12_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB12_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -1158,25 +1178,25 @@ define i16 @test_unsigned_i16_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB13_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB13_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-X87-NEXT:    ja .LBB13_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB13_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -1221,34 +1241,35 @@ define i19 @test_unsigned_i19_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB14_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB14_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $524287, %ecx # imm = 0x7FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-X87-NEXT:    ja .LBB14_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB14_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: test_unsigned_i19_f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE-NEXT:    maxsd %xmm1, %xmm0
-; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE-NEXT:    xorpd %xmm0, %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    maxsd %xmm0, %xmm1
+; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttsd2si %xmm1, %eax
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-LABEL: test_unsigned_i19_f64:
@@ -1280,38 +1301,39 @@ define i32 @test_unsigned_i32_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB15_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB15_2:
 ; X86-X87-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB15_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB15_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
 ; X86-SSE-LABEL: test_unsigned_i32_f64:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
-; X86-SSE-NEXT:    maxsd %xmm1, %xmm0
-; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttsd2si %xmm0, %ecx
+; X86-SSE-NEXT:    xorpd %xmm0, %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT:    maxsd %xmm0, %xmm1
+; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttsd2si %xmm1, %ecx
 ; X86-SSE-NEXT:    movl %ecx, %edx
 ; X86-SSE-NEXT:    sarl $31, %edx
-; X86-SSE-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttsd2si %xmm0, %eax
+; X86-SSE-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttsd2si %xmm1, %eax
 ; X86-SSE-NEXT:    andl %edx, %eax
 ; X86-SSE-NEXT:    orl %ecx, %eax
 ; X86-SSE-NEXT:    retl
@@ -1381,18 +1403,19 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB16_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB16_8:
 ; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB16_10
 ; X86-X87-NEXT:  # %bb.9:
 ; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB16_10:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $16, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    retl
@@ -1400,8 +1423,8 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i50_f64:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $20, %esp
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    maxsd %xmm1, %xmm0
 ; X86-SSE-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
@@ -1492,15 +1515,16 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %edx
 ; X86-X87-NEXT:    ja .LBB17_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %esi, %eax
+; X86-X87-NEXT:    movl %esi, %ecx
 ; X86-X87-NEXT:    movl %edi, %edx
 ; X86-X87-NEXT:  .LBB17_8:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -1509,10 +1533,10 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; X86-SSE-LABEL: test_unsigned_i64_f64:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    subl $20, %esp
+; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm2
-; X86-SSE-NEXT:    xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT:    jbe .LBB17_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    xorpd %xmm2, %xmm2
@@ -1539,8 +1563,8 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:  .LBB17_4:
-; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ecx
+; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ecx, %edx
 ; X86-SSE-NEXT:    cmoval %ecx, %eax
 ; X86-SSE-NEXT:    addl $20, %esp
@@ -1588,53 +1612,57 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunsdfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edi, %edi
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    movl $0, %ecx
 ; X86-X87-NEXT:    jb .LBB18_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:  .LBB18_2:
-; X86-X87-NEXT:    movl $0, %esi
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %edi
 ; X86-X87-NEXT:    jb .LBB18_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB18_4:
 ; X86-X87-NEXT:    jb .LBB18_6
 ; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB18_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $15, %eax
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB18_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB18_8:
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB18_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl %esi, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB18_10:
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
-; X86-X87-NEXT:    movb %al, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB18_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %ebx
+; X86-X87-NEXT:  .LBB18_12:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    movl $15, %edx
+; X86-X87-NEXT:    ja .LBB18_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB18_14:
+; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    movb %dl, 12(%eax)
 ; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -1648,41 +1676,40 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixunsdfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorpd %xmm0, %xmm0
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm1
 ; X86-SSE-NEXT:    movapd %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB18_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB18_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $15, %ebx
-; X86-SSE-NEXT:    cmovbel %edi, %ebx
-; X86-SSE-NEXT:    movl $-1, %edi
-; X86-SSE-NEXT:    cmoval %edi, %edx
-; X86-SSE-NEXT:    cmoval %edi, %ecx
-; X86-SSE-NEXT:    cmoval %edi, %eax
-; X86-SSE-NEXT:    movl %eax, 8(%esi)
-; X86-SSE-NEXT:    movl %ecx, 4(%esi)
-; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
-; X86-SSE-NEXT:    movb %bl, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl $15, %edx
+; X86-SSE-NEXT:    cmovbel %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -1719,7 +1746,7 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fstl {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -1732,56 +1759,66 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunsdfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    jb .LBB19_2
-; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:  .LBB19_2:
 ; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    jb .LBB19_4
-; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jae .LBB19_1
+; X86-X87-NEXT:  # %bb.2:
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jae .LBB19_3
 ; X86-X87-NEXT:  .LBB19_4:
-; X86-X87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB19_6
-; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:  .LBB19_5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB19_6:
 ; X86-X87-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB19_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ebx, %eax
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB19_8:
-; X86-X87-NEXT:    movl %esi, 12(%ecx)
-; X86-X87-NEXT:    movl %edi, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 12(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB19_10
+; X86-X87-NEXT:  # %bb.9:
+; X86-X87-NEXT:    movl %edi, %esi
+; X86-X87-NEXT:  .LBB19_10:
+; X86-X87-NEXT:    movl %esi, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB19_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %esi
+; X86-X87-NEXT:  .LBB19_12:
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB19_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %ebx
+; X86-X87-NEXT:  .LBB19_14:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
+; X86-X87-NEXT:  .LBB19_1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jb .LBB19_4
+; X86-X87-NEXT:  .LBB19_3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    jae .LBB19_5
+; X86-X87-NEXT:    jmp .LBB19_6
 ;
 ; X86-SSE-LABEL: test_unsigned_i128_f64:
 ; X86-SSE:       # %bb.0:
@@ -1789,39 +1826,38 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %eax, (%esp)
 ; X86-SSE-NEXT:    calll __fixunsdfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorpd %xmm0, %xmm0
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE-NEXT:    ucomisd %xmm0, %xmm1
 ; X86-SSE-NEXT:    movapd %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB19_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB19_2:
-; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ebx
+; X86-SSE-NEXT:    ucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 12(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 8(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, 4(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %ecx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
-; X86-SSE-NEXT:    movl %eax, 12(%esi)
-; X86-SSE-NEXT:    movl %ecx, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %edi, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    movl %ecx, (%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -1888,21 +1924,22 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB20_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB20_3
 ; X86-X87-NEXT:  .LBB20_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB20_3:
 ; X86-X87-NEXT:    fld1
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB20_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB20_5:
 ; X86-X87-NEXT:    addl $12, %esp
 ; X86-X87-NEXT:    retl
@@ -1915,11 +1952,11 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -1962,21 +1999,22 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB21_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB21_3
 ; X86-X87-NEXT:  .LBB21_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB21_3:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $-1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB21_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB21_5:
 ; X86-X87-NEXT:    addl $12, %esp
 ; X86-X87-NEXT:    retl
@@ -1987,13 +2025,13 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    calll __extendhfsf2
-; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    calll __extendhfsf2
+; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2032,25 +2070,25 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB22_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB22_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $8191, %ecx # imm = 0x1FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-X87-NEXT:    ja .LBB22_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB22_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $12, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -2062,11 +2100,11 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2105,25 +2143,25 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB23_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB23_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-X87-NEXT:    ja .LBB23_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB23_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $12, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -2135,11 +2173,11 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2179,24 +2217,25 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB24_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB24_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $524287, %ecx # imm = 0x7FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-X87-NEXT:    ja .LBB24_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB24_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $28, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -2208,15 +2247,15 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    maxss %xmm1, %xmm0
-; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %ecx
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    maxss %xmm0, %xmm1
+; X86-SSE-NEXT:    minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %ecx
 ; X86-SSE-NEXT:    movl %ecx, %edx
 ; X86-SSE-NEXT:    sarl $31, %edx
-; X86-SSE-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    cvttss2si %xmm0, %eax
+; X86-SSE-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT:    andl %edx, %eax
 ; X86-SSE-NEXT:    orl %ecx, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
@@ -2257,24 +2296,25 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB25_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB25_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB25_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB25_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $28, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -2299,8 +2339,8 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT:    cmovael %edx, %ecx
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %eax
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    addl $12, %esp
 ; X86-SSE-NEXT:    retl
@@ -2377,18 +2417,19 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB26_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB26_8:
 ; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB26_10
 ; X86-X87-NEXT:  # %bb.9:
 ; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB26_10:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $24, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    retl
@@ -2402,10 +2443,10 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm2, %xmm0
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    jae .LBB26_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    xorps %xmm2, %xmm2
@@ -2432,8 +2473,8 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:  .LBB26_4:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $262143, %edx # imm = 0x3FFFF
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmovbel %eax, %edx
 ; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %esi, %eax
@@ -2521,15 +2562,16 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %edx
 ; X86-X87-NEXT:    ja .LBB27_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %esi, %eax
+; X86-X87-NEXT:    movl %esi, %ecx
 ; X86-X87-NEXT:    movl %edi, %edx
 ; X86-X87-NEXT:  .LBB27_8:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -2543,10 +2585,10 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
 ; X86-SSE-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm2, %xmm0
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X86-SSE-NEXT:    jae .LBB27_2
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    xorps %xmm2, %xmm2
@@ -2573,8 +2615,8 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:  .LBB27_4:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ecx
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ecx, %edx
 ; X86-SSE-NEXT:    cmoval %ecx, %eax
 ; X86-SSE-NEXT:    addl $28, %esp
@@ -2627,53 +2669,57 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunssfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edi, %edi
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    movl $0, %ecx
 ; X86-X87-NEXT:    jb .LBB28_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:  .LBB28_2:
-; X86-X87-NEXT:    movl $0, %esi
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %edi
 ; X86-X87-NEXT:    jb .LBB28_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB28_4:
 ; X86-X87-NEXT:    jb .LBB28_6
 ; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB28_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $15, %eax
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB28_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB28_8:
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB28_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl %esi, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB28_10:
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
-; X86-X87-NEXT:    movb %al, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB28_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %ebx
+; X86-X87-NEXT:  .LBB28_12:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    movl $15, %edx
+; X86-X87-NEXT:    ja .LBB28_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB28_14:
+; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    movb %dl, 12(%eax)
 ; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -2688,7 +2734,6 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
@@ -2700,35 +2745,35 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    calll __fixunssfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB28_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB28_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movl $15, %ebx
-; X86-SSE-NEXT:    cmovbel %edi, %ebx
-; X86-SSE-NEXT:    movl $-1, %edi
-; X86-SSE-NEXT:    cmoval %edi, %edx
-; X86-SSE-NEXT:    cmoval %edi, %ecx
-; X86-SSE-NEXT:    cmoval %edi, %eax
-; X86-SSE-NEXT:    movl %eax, 8(%esi)
-; X86-SSE-NEXT:    movl %ecx, 4(%esi)
-; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
-; X86-SSE-NEXT:    movb %bl, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl $15, %edx
+; X86-SSE-NEXT:    cmovbel %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -2766,7 +2811,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT:    movl %eax, (%esp)
 ; X86-X87-NEXT:    calll __extendhfsf2
@@ -2781,56 +2826,66 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunssfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    jb .LBB29_2
-; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:  .LBB29_2:
 ; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    jb .LBB29_4
-; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jae .LBB29_1
+; X86-X87-NEXT:  # %bb.2:
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jae .LBB29_3
 ; X86-X87-NEXT:  .LBB29_4:
-; X86-X87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB29_6
-; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:  .LBB29_5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB29_6:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB29_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ebx, %eax
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB29_8:
-; X86-X87-NEXT:    movl %esi, 12(%ecx)
-; X86-X87-NEXT:    movl %edi, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 12(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB29_10
+; X86-X87-NEXT:  # %bb.9:
+; X86-X87-NEXT:    movl %edi, %esi
+; X86-X87-NEXT:  .LBB29_10:
+; X86-X87-NEXT:    movl %esi, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB29_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %esi
+; X86-X87-NEXT:  .LBB29_12:
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB29_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %ebx
+; X86-X87-NEXT:  .LBB29_14:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
+; X86-X87-NEXT:  .LBB29_1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jb .LBB29_4
+; X86-X87-NEXT:  .LBB29_3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    jae .LBB29_5
+; X86-X87-NEXT:    jmp .LBB29_6
 ;
 ; X86-SSE-LABEL: test_unsigned_i128_f16:
 ; X86-SSE:       # %bb.0:
@@ -2839,7 +2894,6 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    movw %ax, (%esp)
 ; X86-SSE-NEXT:    calll __extendhfsf2
@@ -2851,33 +2905,33 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    calll __fixunssfti
 ; X86-SSE-NEXT:    subl $4, %esp
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ucomiss %xmm0, %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB29_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB29_2:
-; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    movl $-1, %ebx
+; X86-SSE-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 12(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 8(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, 4(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %ecx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
-; X86-SSE-NEXT:    movl %eax, 12(%esi)
-; X86-SSE-NEXT:    movl %ecx, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %edi, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    movl %ecx, (%eax)
 ; X86-SSE-NEXT:    addl $32, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -2943,21 +2997,22 @@ define i1 @test_unsigned_i1_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB30_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB30_3
 ; X86-X87-NEXT:  .LBB30_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB30_3:
 ; X86-X87-NEXT:    fld1
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB30_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB30_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -2973,18 +3028,18 @@ define i1 @test_unsigned_i1_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    fldz
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    cmovael %eax, %ecx
+; X86-SSE-NEXT:    cmovbl %eax, %ecx
+; X86-SSE-NEXT:    movl $1, %eax
 ; X86-SSE-NEXT:    fld1
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $1, %eax
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $8, %esp
@@ -3040,21 +3095,22 @@ define i8 @test_unsigned_i8_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB31_1
 ; X86-X87-NEXT:  # %bb.2:
-; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:    jmp .LBB31_3
 ; X86-X87-NEXT:  .LBB31_1:
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:  .LBB31_3:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
-; X86-X87-NEXT:    sahf
+; X86-X87-NEXT:    movl %eax, %ecx
 ; X86-X87-NEXT:    movb $-1, %al
+; X86-X87-NEXT:    movb %ch, %ah
+; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    ja .LBB31_5
 ; X86-X87-NEXT:  # %bb.4:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %eax
 ; X86-X87-NEXT:  .LBB31_5:
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
@@ -3070,18 +3126,18 @@ define i8 @test_unsigned_i8_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fists {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fldcw {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    xorl %ecx, %ecx
+; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    fldz
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    cmovael %eax, %ecx
+; X86-SSE-NEXT:    cmovbl %eax, %ecx
+; X86-SSE-NEXT:    movl $255, %eax
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $255, %eax
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    addl $8, %esp
@@ -3133,25 +3189,25 @@ define i13 @test_unsigned_i13_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB32_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB32_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $8191, %ecx # imm = 0x1FFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-X87-NEXT:    ja .LBB32_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB32_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -3175,11 +3231,11 @@ define i13 @test_unsigned_i13_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:  .LBB32_2:
+; X86-SSE-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $8191, %eax # imm = 0x1FFF
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $8, %esp
@@ -3233,25 +3289,25 @@ define i16 @test_unsigned_i16_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB33_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB33_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-X87-NEXT:    ja .LBB33_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB33_4:
-; X86-X87-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $8, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -3275,11 +3331,11 @@ define i16 @test_unsigned_i16_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:  .LBB33_2:
+; X86-SSE-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $65535, %eax # imm = 0xFFFF
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    addl $8, %esp
@@ -3334,24 +3390,25 @@ define i19 @test_unsigned_i19_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB34_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB34_2:
 ; X86-X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $524287, %ecx # imm = 0x7FFFF
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-X87-NEXT:    ja .LBB34_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB34_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -3376,11 +3433,11 @@ define i19 @test_unsigned_i19_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:  .LBB34_2:
+; X86-SSE-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-SSE-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $524287, %eax # imm = 0x7FFFF
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    addl $20, %esp
 ; X86-SSE-NEXT:    retl
@@ -3434,24 +3491,25 @@ define i32 @test_unsigned_i32_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fucom %st(1)
 ; X86-X87-NEXT:    fstp %st(1)
 ; X86-X87-NEXT:    fnstsw %ax
-; X86-X87-NEXT:    xorl %ecx, %ecx
+; X86-X87-NEXT:    xorl %edx, %edx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
 ; X86-X87-NEXT:    jb .LBB35_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-X87-NEXT:  .LBB35_2:
 ; X86-X87-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB35_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl %ecx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB35_4:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    retl
 ;
@@ -3476,11 +3534,11 @@ define i32 @test_unsigned_i32_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:  # %bb.1:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:  .LBB35_2:
+; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %ecx, %eax
 ; X86-SSE-NEXT:    addl $20, %esp
 ; X86-SSE-NEXT:    retl
@@ -3568,18 +3626,19 @@ define i50 @test_unsigned_i50_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    ja .LBB36_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edx, %eax
+; X86-X87-NEXT:    movl %edx, %ecx
 ; X86-X87-NEXT:  .LBB36_8:
 ; X86-X87-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-X87-NEXT:    ja .LBB36_10
 ; X86-X87-NEXT:  # %bb.9:
 ; X86-X87-NEXT:    movl %esi, %edx
 ; X86-X87-NEXT:  .LBB36_10:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $16, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    retl
@@ -3619,11 +3678,11 @@ define i50 @test_unsigned_i50_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl %eax, %esi
 ; X86-SSE-NEXT:  .LBB36_2:
+; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $-1, %eax
 ; X86-SSE-NEXT:    cmovbel %edx, %eax
 ; X86-SSE-NEXT:    movl $262143, %edx # imm = 0x3FFFF
 ; X86-SSE-NEXT:    cmovbel %esi, %edx
@@ -3722,15 +3781,16 @@ define i64 @test_unsigned_i64_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    fxch %st(1)
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ecx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %edx
 ; X86-X87-NEXT:    ja .LBB37_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %esi, %eax
+; X86-X87-NEXT:    movl %esi, %ecx
 ; X86-X87-NEXT:    movl %edi, %edx
 ; X86-X87-NEXT:  .LBB37_8:
+; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $20, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
@@ -3771,11 +3831,11 @@ define i64 @test_unsigned_i64_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl %ecx, %edx
 ; X86-SSE-NEXT:  .LBB37_2:
+; X86-SSE-NEXT:    movl $-1, %ecx
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $-1, %ecx
 ; X86-SSE-NEXT:    cmoval %ecx, %eax
 ; X86-SSE-NEXT:    cmoval %ecx, %edx
 ; X86-SSE-NEXT:    addl $16, %esp
@@ -3827,7 +3887,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fld %st(0)
 ; X86-X87-NEXT:    fstpt {{[0-9]+}}(%esp)
@@ -3842,54 +3902,58 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunsxfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edi, %edi
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
+; X86-X87-NEXT:    movl $0, %ecx
 ; X86-X87-NEXT:    jb .LBB38_2
 ; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:  .LBB38_2:
-; X86-X87-NEXT:    movl $0, %esi
-; X86-X87-NEXT:    movl $0, %ebx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    movl $0, %edi
 ; X86-X87-NEXT:    jb .LBB38_4
 ; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-X87-NEXT:  .LBB38_4:
 ; X86-X87-NEXT:    jb .LBB38_6
 ; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB38_6:
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-X87-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $15, %eax
+; X86-X87-NEXT:    movl $-1, %ebp
 ; X86-X87-NEXT:    ja .LBB38_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %edi, %eax
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB38_8:
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB38_10
 ; X86-X87-NEXT:  # %bb.9:
-; X86-X87-NEXT:    movl %ebx, %edi
-; X86-X87-NEXT:    movl %esi, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-X87-NEXT:    movl %edi, %esi
 ; X86-X87-NEXT:  .LBB38_10:
-; X86-X87-NEXT:    movl %edx, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
-; X86-X87-NEXT:    movb %al, 12(%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB38_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %ebx
+; X86-X87-NEXT:  .LBB38_12:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    movl $15, %edx
+; X86-X87-NEXT:    ja .LBB38_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %edx
+; X86-X87-NEXT:  .LBB38_14:
+; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    movb %dl, 12(%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
@@ -3902,7 +3966,6 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $48, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fld %st(0)
 ; X86-SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
@@ -3912,37 +3975,37 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    calll __fixunsxfti
 ; X86-SSE-NEXT:    subl $4, %esp
 ; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    fldz
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB38_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB38_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $15, %ebx
-; X86-SSE-NEXT:    cmovbel %edi, %ebx
-; X86-SSE-NEXT:    movl $-1, %edi
-; X86-SSE-NEXT:    cmoval %edi, %edx
-; X86-SSE-NEXT:    cmoval %edi, %ecx
-; X86-SSE-NEXT:    cmoval %edi, %eax
-; X86-SSE-NEXT:    movl %eax, 8(%esi)
-; X86-SSE-NEXT:    movl %ecx, 4(%esi)
-; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
-; X86-SSE-NEXT:    movb %bl, 12(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 8(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 4(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, (%eax)
+; X86-SSE-NEXT:    movl $15, %edx
+; X86-SSE-NEXT:    cmovbel %ecx, %edx
+; X86-SSE-NEXT:    andl $15, %edx
+; X86-SSE-NEXT:    movb %dl, 12(%eax)
 ; X86-SSE-NEXT:    addl $48, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
@@ -3985,7 +4048,7 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    pushl %ebx
 ; X86-X87-NEXT:    pushl %edi
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $60, %esp
+; X86-X87-NEXT:    subl $44, %esp
 ; X86-X87-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fld %st(0)
 ; X86-X87-NEXT:    fstpt {{[0-9]+}}(%esp)
@@ -4000,56 +4063,66 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %eax, %ebx
 ; X86-X87-NEXT:    calll __fixunsxfti
 ; X86-X87-NEXT:    subl $4, %esp
-; X86-X87-NEXT:    xorl %edx, %edx
+; X86-X87-NEXT:    xorl %esi, %esi
 ; X86-X87-NEXT:    movb %bh, %ah
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $0, %eax
-; X86-X87-NEXT:    jb .LBB39_2
-; X86-X87-NEXT:  # %bb.1:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-X87-NEXT:  .LBB39_2:
 ; X86-X87-NEXT:    movl $0, %ecx
-; X86-X87-NEXT:    jb .LBB39_4
-; X86-X87-NEXT:  # %bb.3:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl $0, %edx
+; X86-X87-NEXT:    jae .LBB39_1
+; X86-X87-NEXT:  # %bb.2:
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jae .LBB39_3
 ; X86-X87-NEXT:  .LBB39_4:
-; X86-X87-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-X87-NEXT:    movl $0, %ebx
 ; X86-X87-NEXT:    jb .LBB39_6
-; X86-X87-NEXT:  # %bb.5:
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:  .LBB39_5:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-X87-NEXT:  .LBB39_6:
 ; X86-X87-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-X87-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; X86-X87-NEXT:    fucompp
 ; X86-X87-NEXT:    fnstsw %ax
+; X86-X87-NEXT:    movl $-1, %ebx
 ; X86-X87-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-X87-NEXT:    sahf
-; X86-X87-NEXT:    movl $-1, %eax
 ; X86-X87-NEXT:    movl $-1, %ebp
-; X86-X87-NEXT:    movl $-1, %edi
-; X86-X87-NEXT:    movl $-1, %esi
 ; X86-X87-NEXT:    ja .LBB39_8
 ; X86-X87-NEXT:  # %bb.7:
-; X86-X87-NEXT:    movl %ebx, %eax
-; X86-X87-NEXT:    movl %edx, %ebp
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-X87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-X87-NEXT:    movl %esi, %ebp
 ; X86-X87-NEXT:  .LBB39_8:
-; X86-X87-NEXT:    movl %esi, 12(%ecx)
-; X86-X87-NEXT:    movl %edi, 8(%ecx)
-; X86-X87-NEXT:    movl %ebp, 4(%ecx)
-; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    movl %ecx, %eax
-; X86-X87-NEXT:    addl $60, %esp
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-X87-NEXT:    movl %ebp, 12(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB39_10
+; X86-X87-NEXT:  # %bb.9:
+; X86-X87-NEXT:    movl %edi, %esi
+; X86-X87-NEXT:  .LBB39_10:
+; X86-X87-NEXT:    movl %esi, 8(%eax)
+; X86-X87-NEXT:    movl $-1, %esi
+; X86-X87-NEXT:    ja .LBB39_12
+; X86-X87-NEXT:  # %bb.11:
+; X86-X87-NEXT:    movl %edx, %esi
+; X86-X87-NEXT:  .LBB39_12:
+; X86-X87-NEXT:    movl %esi, 4(%eax)
+; X86-X87-NEXT:    ja .LBB39_14
+; X86-X87-NEXT:  # %bb.13:
+; X86-X87-NEXT:    movl %ecx, %ebx
+; X86-X87-NEXT:  .LBB39_14:
+; X86-X87-NEXT:    movl %ebx, (%eax)
+; X86-X87-NEXT:    addl $44, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    popl %edi
 ; X86-X87-NEXT:    popl %ebx
 ; X86-X87-NEXT:    popl %ebp
 ; X86-X87-NEXT:    retl $4
+; X86-X87-NEXT:  .LBB39_1:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-X87-NEXT:    movl $0, %edi
+; X86-X87-NEXT:    jb .LBB39_4
+; X86-X87-NEXT:  .LBB39_3:
+; X86-X87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-X87-NEXT:    jae .LBB39_5
+; X86-X87-NEXT:    jmp .LBB39_6
 ;
 ; X86-SSE-LABEL: test_unsigned_i128_f80:
 ; X86-SSE:       # %bb.0:
@@ -4057,7 +4130,6 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    subl $48, %esp
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    fld %st(0)
 ; X86-SSE-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
@@ -4067,35 +4139,35 @@ define i128 @test_unsigned_i128_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    calll __fixunsxfti
 ; X86-SSE-NEXT:    subl $4, %esp
 ; X86-SSE-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-SSE-NEXT:    xorl %eax, %eax
+; X86-SSE-NEXT:    xorl %ecx, %ecx
 ; X86-SSE-NEXT:    fldz
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucomi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(1)
-; X86-SSE-NEXT:    movl $0, %ecx
 ; X86-SSE-NEXT:    movl $0, %edx
+; X86-SSE-NEXT:    movl $0, %esi
 ; X86-SSE-NEXT:    movl $0, %edi
 ; X86-SSE-NEXT:    jb .LBB39_2
 ; X86-SSE-NEXT:  # %bb.1:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SSE-NEXT:  .LBB39_2:
+; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    fldt {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE-NEXT:    fxch %st(1)
 ; X86-SSE-NEXT:    fucompi %st(1), %st
 ; X86-SSE-NEXT:    fstp %st(0)
-; X86-SSE-NEXT:    movl $-1, %ebx
 ; X86-SSE-NEXT:    cmoval %ebx, %edi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl %edi, 12(%eax)
+; X86-SSE-NEXT:    cmoval %ebx, %esi
+; X86-SSE-NEXT:    movl %esi, 8(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %edx
+; X86-SSE-NEXT:    movl %edx, 4(%eax)
 ; X86-SSE-NEXT:    cmoval %ebx, %ecx
-; X86-SSE-NEXT:    cmoval %ebx, %eax
-; X86-SSE-NEXT:    movl %eax, 12(%esi)
-; X86-SSE-NEXT:    movl %ecx, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 4(%esi)
-; X86-SSE-NEXT:    movl %edi, (%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
+; X86-SSE-NEXT:    movl %ecx, (%eax)
 ; X86-SSE-NEXT:    addl $48, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 46b2571e196bb..2f5def30ff2b1 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -490,12 +490,11 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
 define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
 ; X86-LABEL: freeze_ashr_exact_extra_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $3, %eax
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    sarl $6, %eax
-; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ashr_exact_extra_use:
@@ -605,12 +604,11 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
 define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
 ; X86-LABEL: freeze_lshr_exact_extra_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $3, %eax
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    shrl $5, %eax
-; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_lshr_exact_extra_use:
@@ -999,17 +997,17 @@ define i32 @freeze_ucmp(i32 %a0) nounwind {
 define void @pr59676_frozen(ptr %dst, i32 %x.orig) {
 ; X86-LABEL: pr59676_frozen:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    imull $84, %eax, %eax
-; X86-NEXT:    movl $818089009, %edx # imm = 0x30C30C31
-; X86-NEXT:    imull %edx
+; X86-NEXT:    movl $818089009, %ecx # imm = 0x30C30C31
+; X86-NEXT:    imull %ecx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    sarl $3, %edx
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr59676_frozen:
@@ -1037,17 +1035,17 @@ define void @pr59676_frozen(ptr %dst, i32 %x.orig) {
 define void @pr59676_nsw_frozen(ptr %dst, i32 %x.orig) {
 ; X86-LABEL: pr59676_nsw_frozen:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    imull $84, %eax, %eax
-; X86-NEXT:    movl $818089009, %edx # imm = 0x30C30C31
-; X86-NEXT:    imull %edx
+; X86-NEXT:    movl $818089009, %ecx # imm = 0x30C30C31
+; X86-NEXT:    imull %ecx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    sarl $3, %edx
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr59676_nsw_frozen:
@@ -1076,16 +1074,16 @@ define void @pr59676_nsw(ptr %dst, i32 %x) {
 ; X86-LABEL: pr59676_nsw:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    imull $84, %eax, %eax
-; X86-NEXT:    movl $818089009, %edx # imm = 0x30C30C31
-; X86-NEXT:    imull %edx
+; X86-NEXT:    movl $818089009, %ecx # imm = 0x30C30C31
+; X86-NEXT:    imull %ecx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    sarl $3, %edx
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr59676_nsw:
diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll
index bc9e29957c74a..51e08a8f5c8cf 100644
--- a/llvm/test/CodeGen/X86/freeze-unary.ll
+++ b/llvm/test/CodeGen/X86/freeze-unary.ll
@@ -209,9 +209,9 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
 define i32 @freeze_ctlz(i32 %a0) nounwind {
 ; X86-LABEL: freeze_ctlz:
 ; X86:       # %bb.0:
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $63, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $63, %ecx
+; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    retl
 ;
@@ -231,12 +231,12 @@ define i32 @freeze_ctlz(i32 %a0) nounwind {
 define i32 @freeze_ctlz_undef(i32 %a0) nounwind {
 ; X86-LABEL: freeze_ctlz_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    bsrl %ecx, %edx
+; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    cmovnel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ctlz_undef:
@@ -280,9 +280,9 @@ define i32 @freeze_ctlz_undef_nonzero(i32 %a0) nounwind {
 define i32 @freeze_cttz(i32 %a0) nounwind {
 ; X86-LABEL: freeze_cttz:
 ; X86:       # %bb.0:
-; X86-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_cttz:
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 15b43c41b9945..6e839be7570fb 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -75,8 +75,8 @@ define void @freeze_bitcast_from_wider_elt(ptr %origin, ptr %dst) nounwind {
 ; X86-LABEL: freeze_bitcast_from_wider_elt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -96,10 +96,10 @@ define void @freeze_bitcast_from_wider_elt_escape(ptr %origin, ptr %escape, ptr
 ; X86-LABEL: freeze_bitcast_from_wider_elt_escape:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vmovsd %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovsd %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -122,8 +122,8 @@ define void @freeze_bitcast_to_wider_elt(ptr %origin, ptr %dst) nounwind {
 ; X86-LABEL: freeze_bitcast_to_wider_elt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -143,10 +143,10 @@ define void @freeze_bitcast_to_wider_elt_escape(ptr %origin, ptr %escape, ptr %d
 ; X86-LABEL: freeze_bitcast_to_wider_elt_escape:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vmovsd %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovsd %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -169,10 +169,10 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
 ; X86-LABEL: freeze_extractelement:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovdqa (%edx), %xmm0
-; X86-NEXT:    vpand (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpand (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -193,16 +193,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
 define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ptr %escape) nounwind {
 ; X86-LABEL: freeze_extractelement_escape:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vmovdqa (%esi), %xmm0
-; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
-; X86-NEXT:    vmovdqa %xmm0, (%ecx)
+; X86-NEXT:    vmovdqa (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpand (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_extractelement_escape:
@@ -228,26 +226,22 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    vmovaps (%eax), %xmm0
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    vandps (%eax), %xmm0, %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    movl 16(%ebp), %ecx
 ; X86-NEXT:    andl $15, %ecx
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    movl 12(%ebp), %esi
-; X86-NEXT:    movl 8(%ebp), %edi
-; X86-NEXT:    vmovaps (%edi), %xmm0
-; X86-NEXT:    vandps (%esi), %xmm0, %xmm0
-; X86-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-NEXT:    movzbl (%esp,%ecx), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    cmpb (%esp,%eax), %cl
 ; X86-NEXT:    sete (%edx)
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -277,10 +271,10 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds
 ; X86-LABEL: freeze_buildvector_single_maybe_poison_operand:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $0, (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -308,13 +302,13 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 ; X86-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    andl $15, %ecx
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    andl $15, %eax
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; X86-NEXT:    vpinsrd $0, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -342,28 +336,26 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin
 define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst0, ptr %dst1) nounwind {
 ; X86-LABEL: freeze_two_frozen_buildvectors:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %esi
-; X86-NEXT:    andl $15, %esi
-; X86-NEXT:    movl (%edx), %edx
-; X86-NEXT:    andl $15, %edx
-; X86-NEXT:    vmovd %esi, %xmm0
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [7,7,7,7]
 ; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
-; X86-NEXT:    vmovd %edx, %xmm0
+; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
 ; X86-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_two_frozen_buildvectors:
@@ -400,24 +392,22 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
 define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1, ptr %dst0, ptr %dst1) nounwind {
 ; X86-LABEL: freeze_two_buildvectors_only_one_frozen:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %esi
-; X86-NEXT:    andl $15, %esi
-; X86-NEXT:    vmovd %esi, %xmm0
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
 ; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vbroadcastss (%edx), %xmm2
-; X86-NEXT:    vmovdqa %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vbroadcastss (%eax), %xmm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    vpand %xmm1, %xmm2, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_two_buildvectors_only_one_frozen:
@@ -450,22 +440,20 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
 define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, ptr %dst0, ptr %dst1) nounwind {
 ; X86-LABEL: freeze_two_buildvectors_one_undef_elt:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %esi
-; X86-NEXT:    andl $15, %esi
-; X86-NEXT:    vmovd %esi, %xmm0
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    andl $15, %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vmovddup {{.*#+}} xmm1 = [7,0,7,0]
 ; X86-NEXT:    # xmm1 = mem[0,0]
 ; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
-; X86-NEXT:    vmovdqa %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-NEXT:    vpand %xmm1, %xmm2, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_two_buildvectors_one_undef_elt:
@@ -497,21 +485,17 @@ define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, p
 define void @freeze_buildvector(ptr %origin0, ptr %origin1, ptr %origin2, ptr %origin3, ptr %dst) nounwind {
 ; X86-LABEL: freeze_buildvector:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vpinsrd $1, (%esi), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, (%edx), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_buildvector:
@@ -545,18 +529,16 @@ define void @freeze_buildvector(ptr %origin0, ptr %origin1, ptr %origin2, ptr %o
 define void @freeze_buildvector_one_undef_elt(ptr %origin0, ptr %origin1, ptr %origin2, ptr %origin3, ptr %dst) nounwind {
 ; X86-LABEL: freeze_buildvector_one_undef_elt:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vpinsrd $1, (%edx), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, (%ecx), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_buildvector_one_undef_elt:
@@ -587,26 +569,20 @@ define void @freeze_buildvector_one_undef_elt(ptr %origin0, ptr %origin1, ptr %o
 define void @freeze_buildvector_extrause(ptr %origin0, ptr %origin1, ptr %origin2, ptr %origin3, ptr %dst, ptr %escape) nounwind {
 ; X86-LABEL: freeze_buildvector_extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    vpinsrd $1, (%edi), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $2, (%esi), %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, (%edx), %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; X86-NEXT:    vmovdqa %xmm1, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqa %xmm1, (%eax)
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_buildvector_extrause:
@@ -646,11 +622,11 @@ define void @pr59677(i32 %x, ptr %out) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    calll sinf
 ; X86-NEXT:    fstps (%esi)
 ; X86-NEXT:    addl $4, %esp
diff --git a/llvm/test/CodeGen/X86/fshl-splat-undef.ll b/llvm/test/CodeGen/X86/fshl-splat-undef.ll
index a2d0345077b87..18b6aa8fb0190 100644
--- a/llvm/test/CodeGen/X86/fshl-splat-undef.ll
+++ b/llvm/test/CodeGen/X86/fshl-splat-undef.ll
@@ -20,10 +20,10 @@
 define void @test_fshl(<8 x i64> %lo, <8 x i64> %hi, ptr %arr) {
 ; CHECK-LABEL: test_fshl:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    vpsllq $12, %zmm1, %zmm1
 ; CHECK-NEXT:    vpsrlq $52, %zmm0, %zmm0
-; CHECK-NEXT:    vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogq {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    vmovdqa64 %zmm0, (%eax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 9da2640ea8392..5067074dbb022 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -17,11 +17,11 @@ declare i128 @llvm.fshl.i128(i128, i128, i128) nounwind readnone
 define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ; X86-LABEL: var_shift_i8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    movb %ah, %al
@@ -46,20 +46,20 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i16:
 ; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    andb $15, %cl
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    shldw %cl, %dx, %ax
 ; X86-FAST-NEXT:    retl
 ;
 ; X86-SLOW-LABEL: var_shift_i16:
 ; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shll $16, %ecx
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    shll $16, %eax
-; X86-SLOW-NEXT:    orl %edx, %eax
 ; X86-SLOW-NEXT:    andb $15, %cl
 ; X86-SLOW-NEXT:    shll %cl, %eax
 ; X86-SLOW-NEXT:    shrl $16, %eax
@@ -103,11 +103,11 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ;
 ; X86-SLOW-LABEL: var_shift_i32:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    shll %cl, %edx
 ; X86-SLOW-NEXT:    notb %cl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shrl %eax
 ; X86-SLOW-NEXT:    shrl %cl, %eax
 ; X86-SLOW-NEXT:    orl %edx, %eax
@@ -181,9 +181,9 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %edi
 ; X86-FAST-NEXT:    pushl %esi
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    jne .LBB5_1
 ; X86-FAST-NEXT:  # %bb.2:
@@ -207,9 +207,9 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT:    testb $32, %bl
 ; X86-SLOW-NEXT:    jne .LBB5_1
 ; X86-SLOW-NEXT:  # %bb.2:
@@ -270,55 +270,61 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-NEXT:    pushl %esi
 ; X86-FAST-NEXT:    andl $-16, %esp
 ; X86-FAST-NEXT:    subl $16, %esp
-; X86-FAST-NEXT:    movl 24(%ebp), %edi
-; X86-FAST-NEXT:    movl 28(%ebp), %edx
-; X86-FAST-NEXT:    movl 48(%ebp), %esi
 ; X86-FAST-NEXT:    movl 56(%ebp), %ecx
+; X86-FAST-NEXT:    movl 48(%ebp), %edi
+; X86-FAST-NEXT:    movl 52(%ebp), %esi
+; X86-FAST-NEXT:    movl 24(%ebp), %eax
+; X86-FAST-NEXT:    movl 28(%ebp), %ebx
 ; X86-FAST-NEXT:    testb $64, %cl
-; X86-FAST-NEXT:    movl 52(%ebp), %eax
-; X86-FAST-NEXT:    jne .LBB6_1
+; X86-FAST-NEXT:    movl %esi, %ecx
+; X86-FAST-NEXT:    movl %edi, %edx
+; X86-FAST-NEXT:    je .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %esi, %ebx
-; X86-FAST-NEXT:    movl %edi, %esi
-; X86-FAST-NEXT:    movl 32(%ebp), %edi
-; X86-FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl %edx, %eax
-; X86-FAST-NEXT:    movl 36(%ebp), %edx
-; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    je .LBB6_5
+; X86-FAST-NEXT:    je .LBB6_3
 ; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-FAST-NEXT:    movl %esi, %eax
-; X86-FAST-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-FAST-NEXT:    jmp .LBB6_6
-; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl 44(%ebp), %ebx
-; X86-FAST-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl 40(%ebp), %ebx
-; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    jne .LBB6_4
+; X86-FAST-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    je .LBB6_6
 ; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl 40(%ebp), %edi
+; X86-FAST-NEXT:    movl 44(%ebp), %esi
 ; X86-FAST-NEXT:  .LBB6_6:
-; X86-FAST-NEXT:    movl %esi, %edi
-; X86-FAST-NEXT:    shldl %cl, %ebx, %edi
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
+; X86-FAST-NEXT:    testb $32, %cl
+; X86-FAST-NEXT:    jne .LBB6_7
+; X86-FAST-NEXT:  # %bb.8:
+; X86-FAST-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edx, %esi
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-FAST-NEXT:    jmp .LBB6_9
+; X86-FAST-NEXT:  .LBB6_1:
+; X86-FAST-NEXT:    movl %ebx, %ecx
 ; X86-FAST-NEXT:    movl %eax, %edx
+; X86-FAST-NEXT:    jne .LBB6_4
+; X86-FAST-NEXT:  .LBB6_3:
+; X86-FAST-NEXT:    movl 32(%ebp), %eax
+; X86-FAST-NEXT:    movl 36(%ebp), %ebx
+; X86-FAST-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    jne .LBB6_5
+; X86-FAST-NEXT:    jmp .LBB6_6
+; X86-FAST-NEXT:  .LBB6_7:
 ; X86-FAST-NEXT:    movl %eax, %ebx
-; X86-FAST-NEXT:    shldl %cl, %esi, %ebx
 ; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-FAST-NEXT:    movl %eax, %esi
-; X86-FAST-NEXT:    shldl %cl, %edx, %esi
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:  .LBB6_9:
+; X86-FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %eax, %edi
+; X86-FAST-NEXT:    shldl %cl, %edx, %edi
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %edi, 8(%eax)
+; X86-FAST-NEXT:    shldl %cl, %esi, %edx
+; X86-FAST-NEXT:    movl %edx, 4(%eax)
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-FAST-NEXT:    shldl %cl, %edx, %ebx
+; X86-FAST-NEXT:    movl %ebx, 12(%eax)
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-FAST-NEXT:    shldl %cl, %eax, %edx
-; X86-FAST-NEXT:    movl 8(%ebp), %eax
-; X86-FAST-NEXT:    movl %edx, 12(%eax)
-; X86-FAST-NEXT:    movl %esi, 8(%eax)
-; X86-FAST-NEXT:    movl %ebx, 4(%eax)
-; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    shldl %cl, %edx, %esi
+; X86-FAST-NEXT:    movl %esi, (%eax)
 ; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
@@ -335,83 +341,89 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT:    pushl %esi
 ; X86-SLOW-NEXT:    andl $-16, %esp
 ; X86-SLOW-NEXT:    subl $32, %esp
-; X86-SLOW-NEXT:    movl 24(%ebp), %edi
-; X86-SLOW-NEXT:    movl 28(%ebp), %eax
-; X86-SLOW-NEXT:    movl 48(%ebp), %edx
-; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    testb $64, %cl
-; X86-SLOW-NEXT:    movl 52(%ebp), %ebx
-; X86-SLOW-NEXT:    jne .LBB6_1
-; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edi, %edx
-; X86-SLOW-NEXT:    movl 32(%ebp), %edi
-; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %eax, %ebx
+; X86-SLOW-NEXT:    movl 56(%ebp), %eax
+; X86-SLOW-NEXT:    movl 24(%ebp), %edx
+; X86-SLOW-NEXT:    movl 28(%ebp), %ecx
+; X86-SLOW-NEXT:    testb $64, %al
+; X86-SLOW-NEXT:    movl %ecx, %eax
+; X86-SLOW-NEXT:    jne .LBB6_2
+; X86-SLOW-NEXT:  # %bb.1:
+; X86-SLOW-NEXT:    movl 32(%ebp), %edx
 ; X86-SLOW-NEXT:    movl 36(%ebp), %eax
-; X86-SLOW-NEXT:    jmp .LBB6_3
-; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl 40(%ebp), %ecx
-; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl 44(%ebp), %ecx
-; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB6_3:
-; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    testb $32, %cl
+; X86-SLOW-NEXT:  .LBB6_2:
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 48(%ebp), %esi
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
+; X86-SLOW-NEXT:    movl %edi, %ebx
+; X86-SLOW-NEXT:    movl %esi, %edx
 ; X86-SLOW-NEXT:    jne .LBB6_4
-; X86-SLOW-NEXT:  # %bb.5:
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edx, %edi
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  # %bb.3:
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    movl 24(%ebp), %edx
 ; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    je .LBB6_6
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl 40(%ebp), %esi
+; X86-SLOW-NEXT:    movl 44(%ebp), %edi
+; X86-SLOW-NEXT:  .LBB6_6:
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    testb $32, %cl
+; X86-SLOW-NEXT:    jne .LBB6_7
+; X86-SLOW-NEXT:  # %bb.8:
 ; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edx, %ebx
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SLOW-NEXT:  .LBB6_6:
-; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:    jmp .LBB6_9
+; X86-SLOW-NEXT:  .LBB6_7:
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %ebx, %esi
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:  .LBB6_9:
+; X86-SLOW-NEXT:    movl %ecx, %ebx
 ; X86-SLOW-NEXT:    shll %cl, %eax
-; X86-SLOW-NEXT:    shrl %esi
-; X86-SLOW-NEXT:    movl %ecx, %edx
+; X86-SLOW-NEXT:    movl %esi, %edi
+; X86-SLOW-NEXT:    shrl %edi
+; X86-SLOW-NEXT:    movl %ebx, %edx
 ; X86-SLOW-NEXT:    notb %dl
 ; X86-SLOW-NEXT:    movl %edx, %ecx
-; X86-SLOW-NEXT:    shrl %cl, %esi
-; X86-SLOW-NEXT:    orl %eax, %esi
-; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    orl %eax, %edi
+; X86-SLOW-NEXT:    movl 8(%ebp), %eax
+; X86-SLOW-NEXT:    movl %edi, 12(%eax)
+; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    movl %ebx, %eax
-; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    movl %ebx, %edi
 ; X86-SLOW-NEXT:    shrl %edi
 ; X86-SLOW-NEXT:    movl %edx, %ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
-; X86-SLOW-NEXT:    orl %eax, %edi
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SLOW-NEXT:    movl %esi, %eax
-; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shll %cl, %eax
-; X86-SLOW-NEXT:    shrl %ebx
+; X86-SLOW-NEXT:    orl %esi, %edi
+; X86-SLOW-NEXT:    movl 8(%ebp), %esi
+; X86-SLOW-NEXT:    movl %edi, 8(%esi)
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %ebx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    shrl %edi
 ; X86-SLOW-NEXT:    movl %edx, %ecx
-; X86-SLOW-NEXT:    shrl %cl, %ebx
-; X86-SLOW-NEXT:    orl %eax, %ebx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    orl %ebx, %edi
+; X86-SLOW-NEXT:    movl %edi, 4(%esi)
+; X86-SLOW-NEXT:    movl %esi, %edi
 ; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-SLOW-NEXT:    shrl %esi
 ; X86-SLOW-NEXT:    movl %edx, %ecx
 ; X86-SLOW-NEXT:    shrl %cl, %esi
 ; X86-SLOW-NEXT:    orl %eax, %esi
-; X86-SLOW-NEXT:    movl 8(%ebp), %eax
-; X86-SLOW-NEXT:    movl %esi, 12(%eax)
-; X86-SLOW-NEXT:    movl %ebx, 8(%eax)
-; X86-SLOW-NEXT:    movl %edi, 4(%eax)
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SLOW-NEXT:    movl %ecx, (%eax)
+; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:    movl %esi, (%edi)
 ; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
@@ -466,9 +478,9 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 define i8 @const_shift_i8(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: const_shift_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $7, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
@@ -496,9 +508,9 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
 ;
 ; X86-SLOW-LABEL: const_shift_i16:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    shrl $9, %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shll $7, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -533,9 +545,9 @@ define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
 ;
 ; X86-SLOW-LABEL: const_shift_i32:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    shrl $25, %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shll $7, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    retl
@@ -570,18 +582,16 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
 ;
 ; X86-SLOW-LABEL: const_shift_i64:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl %ecx, %esi
-; X86-SLOW-NEXT:    shrl $25, %esi
+; X86-SLOW-NEXT:    movl %ecx, %eax
+; X86-SLOW-NEXT:    shrl $25, %eax
 ; X86-SLOW-NEXT:    shll $7, %edx
-; X86-SLOW-NEXT:    orl %esi, %edx
+; X86-SLOW-NEXT:    orl %eax, %edx
 ; X86-SLOW-NEXT:    shll $7, %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shrl $25, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
-; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    retl
 ;
 ; X64-FAST-LABEL: const_shift_i64:
@@ -682,10 +692,10 @@ define i64 @combine_fshl_load_i64(ptr %p) nounwind {
 define i16 @combine_fshl_load_i16_chain_use(ptr %p, ptr %q, i16 %r) nounwind {
 ; X86-LABEL: combine_fshl_load_i16_chain_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl 1(%eax), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movw %cx, (%edx)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index c307833e488c9..f337646407d76 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -17,11 +17,11 @@ declare i128 @llvm.fshr.i128(i128, i128, i128) nounwind readnone
 define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 ; X86-LABEL: var_shift_i8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    andb $7, %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -45,20 +45,20 @@ define i8 @var_shift_i8(i8 %x, i8 %y, i8 %z) nounwind {
 define i16 @var_shift_i16(i16 %x, i16 %y, i16 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i16:
 ; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    andb $15, %cl
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    shrdw %cl, %dx, %ax
 ; X86-FAST-NEXT:    retl
 ;
 ; X86-SLOW-LABEL: var_shift_i16:
 ; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shll $16, %ecx
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    shll $16, %eax
-; X86-SLOW-NEXT:    orl %edx, %eax
 ; X86-SLOW-NEXT:    andb $15, %cl
 ; X86-SLOW-NEXT:    shrl %cl, %eax
 ; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -100,11 +100,11 @@ define i32 @var_shift_i32(i32 %x, i32 %y, i32 %z) nounwind {
 ;
 ; X86-SLOW-LABEL: var_shift_i32:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    shrl %cl, %edx
 ; X86-SLOW-NEXT:    notb %cl
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    addl %eax, %eax
 ; X86-SLOW-NEXT:    shll %cl, %eax
 ; X86-SLOW-NEXT:    orl %edx, %eax
@@ -177,9 +177,9 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i64:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %esi
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    je .LBB5_1
 ; X86-FAST-NEXT:  # %bb.2:
@@ -201,9 +201,9 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT:    testb $32, %bl
 ; X86-SLOW-NEXT:    je .LBB5_1
 ; X86-SLOW-NEXT:  # %bb.2:
@@ -264,46 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-NEXT:    pushl %esi
 ; X86-FAST-NEXT:    andl $-16, %esp
 ; X86-FAST-NEXT:    subl $16, %esp
-; X86-FAST-NEXT:    movl 24(%ebp), %esi
-; X86-FAST-NEXT:    movl 28(%ebp), %eax
+; X86-FAST-NEXT:    movl 56(%ebp), %eax
 ; X86-FAST-NEXT:    movl 48(%ebp), %edx
-; X86-FAST-NEXT:    movl 56(%ebp), %ecx
-; X86-FAST-NEXT:    testb $64, %cl
 ; X86-FAST-NEXT:    movl 52(%ebp), %ebx
-; X86-FAST-NEXT:    je .LBB6_1
+; X86-FAST-NEXT:    movl 24(%ebp), %edi
+; X86-FAST-NEXT:    movl 28(%ebp), %esi
+; X86-FAST-NEXT:    testb $64, %al
+; X86-FAST-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edx, %eax
+; X86-FAST-NEXT:    jne .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-FAST-NEXT:    movl %esi, %edx
-; X86-FAST-NEXT:    movl 32(%ebp), %esi
-; X86-FAST-NEXT:    movl %ebx, %edi
-; X86-FAST-NEXT:    movl %eax, %ebx
-; X86-FAST-NEXT:    movl 36(%ebp), %eax
+; X86-FAST-NEXT:    jne .LBB6_3
+; X86-FAST-NEXT:  .LBB6_4:
+; X86-FAST-NEXT:    jne .LBB6_6
+; X86-FAST-NEXT:  .LBB6_5:
+; X86-FAST-NEXT:    movl 40(%ebp), %edx
+; X86-FAST-NEXT:    movl 44(%ebp), %ebx
+; X86-FAST-NEXT:  .LBB6_6:
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
 ; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    je .LBB6_4
-; X86-FAST-NEXT:    jmp .LBB6_5
+; X86-FAST-NEXT:    je .LBB6_7
+; X86-FAST-NEXT:  # %bb.8:
+; X86-FAST-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %eax, %edx
+; X86-FAST-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-FAST-NEXT:    jmp .LBB6_9
 ; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl 40(%ebp), %edi
+; X86-FAST-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edi, %eax
+; X86-FAST-NEXT:    je .LBB6_4
+; X86-FAST-NEXT:  .LBB6_3:
+; X86-FAST-NEXT:    movl 32(%ebp), %edi
+; X86-FAST-NEXT:    movl 36(%ebp), %esi
+; X86-FAST-NEXT:    je .LBB6_5
+; X86-FAST-NEXT:    jmp .LBB6_6
+; X86-FAST-NEXT:  .LBB6_7:
 ; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-FAST-NEXT:    movl 44(%ebp), %edi
-; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    jne .LBB6_5
-; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %esi, %eax
-; X86-FAST-NEXT:    movl %ebx, %esi
+; X86-FAST-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-FAST-NEXT:    movl %eax, %esi
+; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %ebx, %edx
+; X86-FAST-NEXT:  .LBB6_9:
+; X86-FAST-NEXT:    movl %esi, %ebx
+; X86-FAST-NEXT:    shrdl %cl, %edi, %ebx
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %ebx, 8(%eax)
 ; X86-FAST-NEXT:    movl %edx, %ebx
-; X86-FAST-NEXT:    movl %edi, %edx
-; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    shrdl %cl, %edx, %edi
-; X86-FAST-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-FAST-NEXT:    shrdl %cl, %esi, %ebx
+; X86-FAST-NEXT:    movl %ebx, 4(%eax)
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-FAST-NEXT:    shrdl %cl, %esi, %edi
+; X86-FAST-NEXT:    movl %edi, 12(%eax)
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT:    shrdl %cl, %eax, %esi
-; X86-FAST-NEXT:    movl 8(%ebp), %eax
-; X86-FAST-NEXT:    movl %esi, 12(%eax)
-; X86-FAST-NEXT:    movl %ebx, 8(%eax)
-; X86-FAST-NEXT:    movl %edx, 4(%eax)
-; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-FAST-NEXT:    shrdl %cl, %edx, %esi
+; X86-FAST-NEXT:    movl %esi, (%eax)
 ; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
@@ -319,82 +335,93 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
 ; X86-SLOW-NEXT:    andl $-16, %esp
-; X86-SLOW-NEXT:    subl $16, %esp
-; X86-SLOW-NEXT:    movl 24(%ebp), %edx
-; X86-SLOW-NEXT:    movl 28(%ebp), %esi
-; X86-SLOW-NEXT:    movl 48(%ebp), %edi
-; X86-SLOW-NEXT:    movl 56(%ebp), %eax
-; X86-SLOW-NEXT:    testb $64, %al
-; X86-SLOW-NEXT:    movl 52(%ebp), %eax
-; X86-SLOW-NEXT:    je .LBB6_1
+; X86-SLOW-NEXT:    subl $32, %esp
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    movl 48(%ebp), %ebx
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
+; X86-SLOW-NEXT:    movl 24(%ebp), %esi
+; X86-SLOW-NEXT:    movl 28(%ebp), %edx
+; X86-SLOW-NEXT:    testb $64, %cl
+; X86-SLOW-NEXT:    movl %ebx, %eax
+; X86-SLOW-NEXT:    movl %edi, %ecx
+; X86-SLOW-NEXT:    jne .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edx, %edi
-; X86-SLOW-NEXT:    movl 32(%ebp), %edx
-; X86-SLOW-NEXT:    movl %eax, %ecx
-; X86-SLOW-NEXT:    movl %esi, %eax
-; X86-SLOW-NEXT:    movl 36(%ebp), %esi
-; X86-SLOW-NEXT:    jmp .LBB6_3
-; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl 40(%ebp), %ecx
-; X86-SLOW-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl 44(%ebp), %ecx
-; X86-SLOW-NEXT:  .LBB6_3:
-; X86-SLOW-NEXT:    movl 56(%ebp), %ebx
-; X86-SLOW-NEXT:    testb $32, %bl
-; X86-SLOW-NEXT:    je .LBB6_4
-; X86-SLOW-NEXT:  # %bb.5:
-; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %ecx, %ebx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    jne .LBB6_3
 ; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %edx, %esi
-; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %ecx, %edi
-; X86-SLOW-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jne .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_5:
+; X86-SLOW-NEXT:    movl 40(%ebp), %ebx
+; X86-SLOW-NEXT:    movl 44(%ebp), %edi
 ; X86-SLOW-NEXT:  .LBB6_6:
 ; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    shrl %cl, %ebx
-; X86-SLOW-NEXT:    movl %ecx, %edx
-; X86-SLOW-NEXT:    notb %dl
-; X86-SLOW-NEXT:    movl %edi, %eax
-; X86-SLOW-NEXT:    addl %edi, %edi
+; X86-SLOW-NEXT:    testb $32, %cl
+; X86-SLOW-NEXT:    je .LBB6_7
+; X86-SLOW-NEXT:  # %bb.8:
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, %esi
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jmp .LBB6_9
+; X86-SLOW-NEXT:  .LBB6_1:
+; X86-SLOW-NEXT:    movl %esi, %eax
 ; X86-SLOW-NEXT:    movl %edx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %edi
-; X86-SLOW-NEXT:    orl %ebx, %edi
-; X86-SLOW-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    je .LBB6_4
+; X86-SLOW-NEXT:  .LBB6_3:
+; X86-SLOW-NEXT:    movl 32(%ebp), %esi
+; X86-SLOW-NEXT:    movl 36(%ebp), %edx
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    je .LBB6_5
+; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_7:
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edi, %esi
+; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:  .LBB6_9:
+; X86-SLOW-NEXT:    movl %esi, %edx
 ; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
-; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %eax
-; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-SLOW-NEXT:    leal (%edi,%edi), %ebx
-; X86-SLOW-NEXT:    movl %edx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %ebx
-; X86-SLOW-NEXT:    orl %eax, %ebx
+; X86-SLOW-NEXT:    shrl %cl, %edx
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    notb %bl
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    addl %eax, %eax
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    orl %edx, %eax
+; X86-SLOW-NEXT:    movl 8(%ebp), %ecx
+; X86-SLOW-NEXT:    movl %eax, 4(%ecx)
 ; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SLOW-NEXT:    leal (%edx,%edx), %eax
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    orl %edi, %eax
+; X86-SLOW-NEXT:    movl 8(%ebp), %edi
+; X86-SLOW-NEXT:    movl %eax, 8(%edi)
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    shrl %cl, %edx
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SLOW-NEXT:    leal (%eax,%eax), %edi
-; X86-SLOW-NEXT:    movl %edx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %edi
-; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-SLOW-NEXT:    addl %eax, %eax
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    orl %edx, %eax
+; X86-SLOW-NEXT:    movl %eax, 12(%edi)
 ; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    shrl %cl, %eax
 ; X86-SLOW-NEXT:    addl %esi, %esi
-; X86-SLOW-NEXT:    movl %edx, %ecx
+; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    orl %eax, %esi
-; X86-SLOW-NEXT:    movl 8(%ebp), %eax
-; X86-SLOW-NEXT:    movl %esi, 12(%eax)
-; X86-SLOW-NEXT:    movl %edi, 8(%eax)
-; X86-SLOW-NEXT:    movl %ebx, 4(%eax)
-; X86-SLOW-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-SLOW-NEXT:    movl %ecx, (%eax)
+; X86-SLOW-NEXT:    movl %edi, %eax
+; X86-SLOW-NEXT:    movl %esi, (%edi)
 ; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
@@ -447,9 +474,9 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 define i8 @const_shift_i8(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: const_shift_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrb $7, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
@@ -476,9 +503,9 @@ define i16 @const_shift_i16(i16 %x, i16 %y) nounwind {
 ;
 ; X86-SLOW-LABEL: const_shift_i16:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    shrl $7, %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shll $9, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -513,9 +540,9 @@ define i32 @const_shift_i32(i32 %x, i32 %y) nounwind {
 ;
 ; X86-SLOW-LABEL: const_shift_i32:
 ; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    shrl $7, %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    shll $25, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    retl
@@ -551,7 +578,6 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
 ; X86-SLOW-LABEL: const_shift_i64:
 ; X86-SLOW:       # %bb.0:
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT:    shrl $7, %ecx
@@ -559,6 +585,7 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
 ; X86-SLOW-NEXT:    shll $25, %eax
 ; X86-SLOW-NEXT:    orl %ecx, %eax
 ; X86-SLOW-NEXT:    shrl $7, %esi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    shll $25, %edx
 ; X86-SLOW-NEXT:    orl %esi, %edx
 ; X86-SLOW-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/fsxor-alignment.ll b/llvm/test/CodeGen/X86/fsxor-alignment.ll
index 32af5b969b6d5..58e241a4e43f3 100644
--- a/llvm/test/CodeGen/X86/fsxor-alignment.ll
+++ b/llvm/test/CodeGen/X86/fsxor-alignment.ll
@@ -8,16 +8,14 @@
 define void @foo(ptr %p, ptr %q, float %s, float %y) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    xorl %edx, %esi
-; CHECK-NEXT:    movl %esi, (%ecx)
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl %edx, (%eax)
-; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    xorl %eax, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %ecx, (%edx)
+; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    retl
   %ss = fsub float -0.0, %s
   %yy = fsub float -0.0, %y
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 9095fb1550e70..f6220ffb57c77 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -697,8 +697,8 @@ define float @trunc_unsigned_f32_disable_via_intrinsic(float %x) #0 {
 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vucomiss %xmm1, %xmm0
 ; X86-AVX1-NEXT:    cmovael %edx, %eax
-; X86-AVX1-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    movl $-1, %ecx
+; X86-AVX1-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    cmovbel %eax, %ecx
 ; X86-AVX1-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
@@ -751,16 +751,16 @@ define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 {
 ; X86-AVX1-NEXT:    fldl (%esp)
 ; X86-AVX1-NEXT:    fisttpll (%esp)
 ; X86-AVX1-NEXT:    xorl %eax, %eax
-; X86-AVX1-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-AVX1-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    movl $0, %edx
 ; X86-AVX1-NEXT:    jb .LBB19_2
 ; X86-AVX1-NEXT:  # %bb.1:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl (%esp), %edx
 ; X86-AVX1-NEXT:  .LBB19_2:
-; X86-AVX1-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    movl $-1, %esi
+; X86-AVX1-NEXT:    vucomisd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-AVX1-NEXT:    cmovbel %edx, %esi
 ; X86-AVX1-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-AVX1-NEXT:    cmovbel %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
index 7d106fce44555..b5272199271df 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
@@ -277,9 +277,9 @@ define i64 @rotr_i64(i64 %x, i64 %z) nounwind {
 ; X86-SSE2-LABEL: rotr_i64:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
 ; X86-SSE2-NEXT:    movl %eax, %edx
 ; X86-SSE2-NEXT:    cmovel %esi, %edx
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 318a48b92cb28..d9598d0067fb8 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -43,9 +43,9 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
 ; X86-SSE2-NEXT:    movl %edx, %edi
 ; X86-SSE2-NEXT:    cmovnel %esi, %edi
@@ -80,50 +80,49 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    movl 48(%ebp), %edi
-; X86-SSE2-NEXT:    movl 52(%ebp), %eax
+; X86-SSE2-NEXT:    movl 52(%ebp), %esi
 ; X86-SSE2-NEXT:    movl 24(%ebp), %edx
-; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $64, %cl
 ; X86-SSE2-NEXT:    movl %edx, %ecx
 ; X86-SSE2-NEXT:    cmovnel %edi, %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%ebp), %esi
-; X86-SSE2-NEXT:    movl %esi, %ebx
-; X86-SSE2-NEXT:    cmovnel %eax, %ebx
-; X86-SSE2-NEXT:    cmovnel 44(%ebp), %eax
-; X86-SSE2-NEXT:    cmovnel 40(%ebp), %edi
-; X86-SSE2-NEXT:    cmovel 36(%ebp), %esi
+; X86-SSE2-NEXT:    movl 28(%ebp), %ebx
+; X86-SSE2-NEXT:    movl %ebx, %eax
+; X86-SSE2-NEXT:    cmovnel %esi, %eax
+; X86-SSE2-NEXT:    cmovel 36(%ebp), %ebx
 ; X86-SSE2-NEXT:    cmovel 32(%ebp), %edx
+; X86-SSE2-NEXT:    cmovnel 44(%ebp), %esi
+; X86-SSE2-NEXT:    cmovnel 40(%ebp), %edi
 ; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
-; X86-SSE2-NEXT:    cmovnel %edx, %esi
-; X86-SSE2-NEXT:    cmovnel %ebx, %edx
+; X86-SSE2-NEXT:    cmovnel %edx, %ebx
+; X86-SSE2-NEXT:    cmovnel %eax, %edx
 ; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    cmovnel %ecx, %ebx
-; X86-SSE2-NEXT:    cmovel %eax, %edi
-; X86-SSE2-NEXT:    cmovel %ecx, %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    cmovnel %ecx, %eax
+; X86-SSE2-NEXT:    cmovel %esi, %edi
+; X86-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    cmovel %ecx, %esi
+; X86-SSE2-NEXT:    movl %edx, %edi
 ; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-SSE2-NEXT:    movl %ebx, %edi
+; X86-SSE2-NEXT:    shldl %cl, %eax, %edi
+; X86-SSE2-NEXT:    movl 8(%ebp), %ecx
+; X86-SSE2-NEXT:    movl %edi, 8(%ecx)
+; X86-SSE2-NEXT:    movl %ecx, %edi
 ; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %eax, %edi
-; X86-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl %edx, %edi
+; X86-SSE2-NEXT:    shldl %cl, %esi, %eax
+; X86-SSE2-NEXT:    movl %eax, 4(%edi)
 ; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
-; X86-SSE2-NEXT:    shldl %cl, %ebx, %edi
-; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    shldl %cl, %edx, %ebx
+; X86-SSE2-NEXT:    movl %ebx, 12(%edi)
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %edx, %esi
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    shldl %cl, %eax, %esi
+; X86-SSE2-NEXT:    movl %esi, (%edi)
+; X86-SSE2-NEXT:    movl %edi, %eax
 ; X86-SSE2-NEXT:    leal -12(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
@@ -152,40 +151,33 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
 define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X86-SSE2-LABEL: fshl_i37:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    andl $31, %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andl $31, %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    shldl $27, %ebx, %edi
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT:    shldl $27, %edi, %esi
+; X86-SSE2-NEXT:    shll $27, %edi
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $37
+; X86-SSE2-NEXT:    pushl %ecx
 ; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    calll __umoddi3
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
-; X86-SSE2-NEXT:    jne .LBB3_1
-; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movl %edi, %ebx
-; X86-SSE2-NEXT:    movl %esi, %edi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    jmp .LBB3_3
-; X86-SSE2-NEXT:  .LBB3_1:
-; X86-SSE2-NEXT:    shll $27, %ebx
-; X86-SSE2-NEXT:  .LBB3_3:
-; X86-SSE2-NEXT:    movl %edi, %eax
-; X86-SSE2-NEXT:    shldl %cl, %ebx, %eax
+; X86-SSE2-NEXT:    cmovel %esi, %edi
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    cmovel %edx, %esi
+; X86-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %esi, %eax
+; X86-SSE2-NEXT:    shldl %cl, %edi, %eax
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %edi, %esi
-; X86-SSE2-NEXT:    movl %esi, %edx
+; X86-SSE2-NEXT:    shldl %cl, %esi, %edx
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshl_i37:
@@ -318,41 +310,40 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
 define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X86-SSE2-LABEL: fshr_i37:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    andl $31, %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andl $31, %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    shldl $27, %ebx, %esi
+; X86-SSE2-NEXT:    shldl $27, %edi, %esi
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $37
+; X86-SSE2-NEXT:    pushl %ecx
 ; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    calll __umoddi3
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %ecx
 ; X86-SSE2-NEXT:    addl $27, %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    testb $32, %cl
 ; X86-SSE2-NEXT:    je .LBB10_1
 ; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movl %edi, %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT:    movl %eax, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    jmp .LBB10_3
 ; X86-SSE2-NEXT:  .LBB10_1:
-; X86-SSE2-NEXT:    shll $27, %ebx
+; X86-SSE2-NEXT:    shll $27, %edi
 ; X86-SSE2-NEXT:    movl %esi, %edx
-; X86-SSE2-NEXT:    movl %ebx, %esi
+; X86-SSE2-NEXT:    movl %edi, %esi
 ; X86-SSE2-NEXT:  .LBB10_3:
 ; X86-SSE2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shrdl %cl, %edi, %edx
+; X86-SSE2-NEXT:    shrdl %cl, %eax, %edx
 ; X86-SSE2-NEXT:    movl %esi, %eax
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshr_i37:
@@ -450,9 +441,9 @@ define i32 @fshl_i32_undef0(i32 %a0, i32 %a1) nounwind {
 define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshl_i32_undef0_msk:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    andl $7, %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SSE2-NEXT:    shldl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
@@ -535,9 +526,9 @@ define i32 @fshl_i32_undef1(i32 %a0, i32 %a1) nounwind {
 define i32 @fshl_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshl_i32_undef1_msk:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    andb $7, %cl
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    shll %cl, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -628,9 +619,9 @@ define i32 @fshr_i32_undef0(i32 %a0, i32 %a1) nounwind {
 define i32 @fshr_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshr_i32_undef0_msk:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    andb $7, %cl
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    shrl %cl, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -707,9 +698,9 @@ define i32 @fshr_i32_undef1(i32 %a0, i32 %a1) nounwind {
 define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshr_i32_undef1_msk:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    andl $7, %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SSE2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
@@ -825,8 +816,8 @@ define i32 @fshl_i32_zero1(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshl_i32_zero1:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    xorl %edx, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    shldl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -862,8 +853,8 @@ define i32 @fshr_i32_zero0(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshr_i32_zero0:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    xorl %edx, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1085,34 +1076,37 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw
 define void @PR45265(i32 %0, ptr nocapture readonly %1) nounwind {
 ; X86-SSE2-LABEL: PR45265:
 ; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    leal (%eax,%eax,2), %esi
-; X86-SSE2-NEXT:    movzwl 8(%ecx,%esi,4), %edx
-; X86-SSE2-NEXT:    movl 4(%ecx,%esi,4), %edi
-; X86-SSE2-NEXT:    shrdl $8, %edx, %edi
-; X86-SSE2-NEXT:    xorl %eax, %edi
-; X86-SSE2-NEXT:    sarl $31, %eax
-; X86-SSE2-NEXT:    movzbl 10(%ecx,%esi,4), %ecx
+; X86-SSE2-NEXT:    leal (%eax,%eax,2), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SSE2-NEXT:    movzwl 8(%edi,%edx,4), %esi
+; X86-SSE2-NEXT:    movzbl 10(%edi,%edx,4), %ecx
 ; X86-SSE2-NEXT:    shll $16, %ecx
-; X86-SSE2-NEXT:    orl %edx, %ecx
+; X86-SSE2-NEXT:    orl %esi, %ecx
 ; X86-SSE2-NEXT:    shll $8, %ecx
-; X86-SSE2-NEXT:    movl %ecx, %edx
-; X86-SSE2-NEXT:    sarl $8, %edx
+; X86-SSE2-NEXT:    movl %ecx, %ebx
+; X86-SSE2-NEXT:    sarl $8, %ebx
 ; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    shldl $24, %edx, %ecx
-; X86-SSE2-NEXT:    xorl %eax, %ecx
-; X86-SSE2-NEXT:    orl %ecx, %edi
+; X86-SSE2-NEXT:    shldl $24, %ebx, %ecx
+; X86-SSE2-NEXT:    movl 4(%edi,%edx,4), %edx
+; X86-SSE2-NEXT:    shrdl $8, %esi, %edx
+; X86-SSE2-NEXT:    xorl %eax, %edx
+; X86-SSE2-NEXT:    sarl $31, %eax
+; X86-SSE2-NEXT:    xorl %ecx, %eax
+; X86-SSE2-NEXT:    orl %eax, %edx
 ; X86-SSE2-NEXT:    jne .LBB50_1
 ; X86-SSE2-NEXT:  # %bb.2:
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    jmp _Z3foov # TAILCALL
 ; X86-SSE2-NEXT:  .LBB50_1:
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
+; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: PR45265:
@@ -1150,10 +1144,10 @@ declare dso_local void @_Z3foov()
 define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_shl_fshl:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    shldl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1174,10 +1168,10 @@ define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_shl_rotl:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    shll %cl, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    roll %cl, %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
 ; X86-SSE2-NEXT:    retl
@@ -1200,10 +1194,10 @@ define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_shl_fshl_commute:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    shldl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1224,11 +1218,11 @@ define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_shl_rotl_commute:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    shll %cl, %edx
-; X86-SSE2-NEXT:    roll %cl, %eax
+; X86-SSE2-NEXT:    roll %cl, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    shll %cl, %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1250,10 +1244,10 @@ define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_lshr_fshr:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1274,10 +1268,10 @@ define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_lshr_rotr:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    shrl %cl, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    rorl %cl, %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
 ; X86-SSE2-NEXT:    retl
@@ -1300,10 +1294,10 @@ define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_lshr_fshr_commute:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1324,11 +1318,11 @@ define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind {
 define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) nounwind {
 ; X86-SSE2-LABEL: or_lshr_rotr_commute:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    shrl %cl, %edx
-; X86-SSE2-NEXT:    rorl %cl, %eax
+; X86-SSE2-NEXT:    rorl %cl, %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    shrl %cl, %eax
 ; X86-SSE2-NEXT:    orl %edx, %eax
 ; X86-SSE2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index a67ce8f0be5b0..758470464a4b0 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -90,23 +90,19 @@ define <4 x double> @foo(ptr %p, ptr %i, ptr %h) nounwind {
 ;
 ; LIN32-LABEL: foo:
 ; LIN32:       # %bb.0:
-; LIN32-NEXT:    pushl %edi
-; LIN32-NEXT:    pushl %esi
 ; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; LIN32-NEXT:    movdqa (%ecx), %xmm0
-; LIN32-NEXT:    pand (%eax), %xmm0
+; LIN32-NEXT:    movdqa (%eax), %xmm2
 ; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; LIN32-NEXT:    movd %xmm0, %ecx
-; LIN32-NEXT:    pextrd $1, %xmm0, %edx
-; LIN32-NEXT:    pextrd $2, %xmm0, %esi
-; LIN32-NEXT:    pextrd $3, %xmm0, %edi
+; LIN32-NEXT:    pand (%eax), %xmm2
+; LIN32-NEXT:    movd %xmm2, %eax
+; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; LIN32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; LIN32-NEXT:    pextrd $1, %xmm2, %eax
 ; LIN32-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; LIN32-NEXT:    pextrd $2, %xmm2, %eax
 ; LIN32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; LIN32-NEXT:    pextrd $3, %xmm2, %eax
 ; LIN32-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; LIN32-NEXT:    popl %esi
-; LIN32-NEXT:    popl %edi
 ; LIN32-NEXT:    retl
   %a = load <4 x i32>, ptr %i
   %b = load <4 x i32>, ptr %h
@@ -222,29 +218,25 @@ define <4 x i64> @old(ptr %p, ptr %i, ptr %h, i64 %f) nounwind {
 ;
 ; LIN32-LABEL: old:
 ; LIN32:       # %bb.0:
-; LIN32-NEXT:    pushl %edi
-; LIN32-NEXT:    pushl %esi
 ; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; LIN32-NEXT:    movdqa (%ecx), %xmm0
-; LIN32-NEXT:    pand (%eax), %xmm0
-; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; LIN32-NEXT:    movd %xmm0, %edx
-; LIN32-NEXT:    pextrd $1, %xmm0, %esi
-; LIN32-NEXT:    pextrd $2, %xmm0, %eax
-; LIN32-NEXT:    pextrd $3, %xmm0, %edi
-; LIN32-NEXT:    andl %ecx, %edx
-; LIN32-NEXT:    andl %ecx, %esi
-; LIN32-NEXT:    andl %ecx, %eax
-; LIN32-NEXT:    andl %ecx, %edi
-; LIN32-NEXT:    movd %esi, %xmm1
-; LIN32-NEXT:    movd %edx, %xmm0
-; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN32-NEXT:    movd %edi, %xmm2
-; LIN32-NEXT:    movd %eax, %xmm1
+; LIN32-NEXT:    movdqa (%eax), %xmm1
+; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LIN32-NEXT:    pand (%eax), %xmm1
+; LIN32-NEXT:    pextrd $1, %xmm1, %ecx
+; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LIN32-NEXT:    andl %eax, %ecx
+; LIN32-NEXT:    movd %ecx, %xmm2
+; LIN32-NEXT:    movd %xmm1, %ecx
+; LIN32-NEXT:    andl %eax, %ecx
+; LIN32-NEXT:    movd %ecx, %xmm0
+; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; LIN32-NEXT:    pextrd $3, %xmm1, %ecx
+; LIN32-NEXT:    andl %eax, %ecx
+; LIN32-NEXT:    movd %ecx, %xmm2
+; LIN32-NEXT:    pextrd $2, %xmm1, %ecx
+; LIN32-NEXT:    andl %eax, %ecx
+; LIN32-NEXT:    movd %ecx, %xmm1
 ; LIN32-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; LIN32-NEXT:    popl %esi
-; LIN32-NEXT:    popl %edi
 ; LIN32-NEXT:    retl
   %a = load <4 x i32>, ptr %i
   %b = load <4 x i32>, ptr %h
diff --git a/llvm/test/CodeGen/X86/ghc-cc.ll b/llvm/test/CodeGen/X86/ghc-cc.ll
index 814de5a4a82c6..f0314fbef015f 100644
--- a/llvm/test/CodeGen/X86/ghc-cc.ll
+++ b/llvm/test/CodeGen/X86/ghc-cc.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -tailcallopt -mtriple=i686-linux-gnu | FileCheck %s
 
 ; Test the GHC call convention works (x86-32)
@@ -8,35 +9,55 @@
 @r1   = external dso_local global i32 ; assigned to register: esi
 
 define void @zap(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: zap:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    calll addtwo@PLT
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    calll foo@PLT
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
-  ; CHECK: movl {{[0-9]*}}(%esp), %ebx
-  ; CHECK-NEXT: movl {{[0-9]*}}(%esp), %ebp
-  ; CHECK-NEXT: calll addtwo
   %0 = call ghccc i32 @addtwo(i32 %a, i32 %b)
-  ; CHECK: calll foo
   call void @foo() nounwind
   ret void
 }
 
 define ghccc i32 @addtwo(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: addtwo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    leal (%ebx,%ebp), %eax
+; CHECK-NEXT:    retl $12
 entry:
-  ; CHECK: leal (%ebx,%ebp), %eax
   %0 = add i32 %x, %y
-  ; CHECK-NEXT: ret
   ret i32 %0
 }
 
 define ghccc void @foo() nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    movl base, %ebx
+; CHECK-NEXT:    movl sp, %ebp
+; CHECK-NEXT:    movl hp, %edi
+; CHECK-NEXT:    movl r1, %esi
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    jmp bar@PLT # TAILCALL
 entry:
-  ; CHECK:      movl r1, %esi
-  ; CHECK-NEXT: movl hp, %edi
-  ; CHECK-NEXT: movl sp, %ebp
-  ; CHECK-NEXT: movl base, %ebx
   %0 = load i32, ptr @r1
   %1 = load i32, ptr @hp
   %2 = load i32, ptr @sp
   %3 = load i32, ptr @base
-  ; CHECK: jmp bar
   tail call ghccc void @bar( i32 %3, i32 %2, i32 %1, i32 %0 ) nounwind
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/gpr-to-mask.ll b/llvm/test/CodeGen/X86/gpr-to-mask.ll
index d1513b584887f..e5e15adfff248 100644
--- a/llvm/test/CodeGen/X86/gpr-to-mask.ll
+++ b/llvm/test/CodeGen/X86/gpr-to-mask.ll
@@ -514,17 +514,17 @@ define void @test_add(i1 %cond, ptr %ptr1, ptr %ptr2, <8 x float> %fvec1, <8 x f
 ; X86-32-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X86-32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-32-NEXT:    kmovb (%edx), %k0
-; X86-32-NEXT:    kmovb (%ecx), %k1
+; X86-32-NEXT:    kmovb (%eax), %k0
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT:    kmovb (%eax), %k1
+; X86-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-32-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-32-NEXT:    je .LBB9_2
 ; X86-32-NEXT:  # %bb.1: # %if
-; X86-32-NEXT:    kandb %k1, %k0, %k1
+; X86-32-NEXT:    kandb %k0, %k1, %k1
 ; X86-32-NEXT:    jmp .LBB9_3
 ; X86-32-NEXT:  .LBB9_2: # %else
-; X86-32-NEXT:    kaddb %k1, %k0, %k1
+; X86-32-NEXT:    kaddb %k0, %k1, %k1
 ; X86-32-NEXT:  .LBB9_3: # %exit
 ; X86-32-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
 ; X86-32-NEXT:    vmovaps %ymm1, (%eax)
diff --git a/llvm/test/CodeGen/X86/h-registers-2.ll b/llvm/test/CodeGen/X86/h-registers-2.ll
index 5c42c97e7a43e..5baf51ed1c57e 100644
--- a/llvm/test/CodeGen/X86/h-registers-2.ll
+++ b/llvm/test/CodeGen/X86/h-registers-2.ll
@@ -7,10 +7,10 @@
 define i32 @foo(ptr %x, i32 %y) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    imull %eax, %eax
 ; CHECK-NEXT:    movzbl %ah, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movb $77, (%ecx,%eax,8)
 ; CHECK-NEXT:    shll $3, %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index d5f2060ca20e3..7a773ec0ffcbb 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -342,13 +342,13 @@ define void @add() strictfp {
 ;
 ; X86-F16C-LABEL: add:
 ; X86-F16C:       # %bb.0:
-; X86-F16C-NEXT:    movzwl a, %eax
+; X86-F16C-NEXT:    movzwl b, %eax
 ; X86-F16C-NEXT:    vmovd %eax, %xmm0
 ; X86-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; X86-F16C-NEXT:    movzwl b, %eax
+; X86-F16C-NEXT:    movzwl a, %eax
 ; X86-F16C-NEXT:    vmovd %eax, %xmm1
 ; X86-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; X86-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X86-F16C-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; X86-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index b6a4a12eb0fac..72eee72646129 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -25,10 +25,10 @@ define void @test_load_store(ptr %in, ptr %out) #0 {
 ; CHECK-I686-LABEL: test_load_store:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-I686-NEXT:    pinsrw $0, (%ecx), %xmm0
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %ecx
-; CHECK-I686-NEXT:    movw %cx, (%eax)
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
 ; CHECK-I686-NEXT:    retl
   %val = load half, ptr %in
   store half %val, ptr %out
@@ -151,16 +151,14 @@ define void @test_trunc32(float %in, ptr %addr) #0 {
 ;
 ; CHECK-I686-LABEL: test_trunc32:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $8, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    movd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $8, %esp
-; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
+; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %val16 = fptrunc float %in to half
   store half %val16, ptr %addr
@@ -189,16 +187,14 @@ define void @test_trunc64(double %in, ptr %addr) #0 {
 ;
 ; CHECK-I686-LABEL: test_trunc64:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $8, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-I686-NEXT:    movq %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $8, %esp
-; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
+; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %val16 = fptrunc double %in to half
   store half %val16, ptr %addr
@@ -269,9 +265,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_sitofp_i64:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $24, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    subl $28, %esp
 ; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-I686-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
@@ -280,9 +274,9 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    movd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $24, %esp
-; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
+; CHECK-I686-NEXT:    addl $28, %esp
 ; CHECK-I686-NEXT:    retl
   %r = sitofp i64 %a to half
   store half %r, ptr %p
@@ -327,8 +321,8 @@ define i64 @test_fptoui_i64(ptr %p) #0 {
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    ucomiss %xmm1, %xmm0
 ; CHECK-I686-NEXT:    jae .LBB9_2
 ; CHECK-I686-NEXT:  # %bb.1:
@@ -401,21 +395,19 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_uitofp_i64:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $24, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    subl $28, %esp
 ; CHECK-I686-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-I686-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-I686-NEXT:    shrl $31, %eax
 ; CHECK-I686-NEXT:    fildll {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; CHECK-I686-NEXT:    fstps (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esi)
-; CHECK-I686-NEXT:    addl $24, %esp
-; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-I686-NEXT:    movw %ax, (%ecx)
+; CHECK-I686-NEXT:    addl $28, %esp
 ; CHECK-I686-NEXT:    retl
   %r = uitofp i64 %a to half
   store half %r, ptr %p
@@ -459,27 +451,24 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_extend32_vec4:
 ; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $88, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 6(%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 4(%eax), %xmm0
-; CHECK-I686-NEXT:    pinsrw $0, 2(%eax), %xmm1
-; CHECK-I686-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-I686-NEXT:    subl $52, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    pinsrw $0, 2(%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
+; CHECK-I686-NEXT:    pinsrw $0, 4(%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %edi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
+; CHECK-I686-NEXT:    movw %di, (%esp)
+; CHECK-I686-NEXT:    pinsrw $0, 6(%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %edi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movw %di, (%esp)
+; CHECK-I686-NEXT:    pinsrw $0, (%esi), %xmm0
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    movw %si, (%esp)
@@ -497,8 +486,9 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 {
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-I686-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-I686-NEXT:    addl $88, %esp
+; CHECK-I686-NEXT:    addl $52, %esp
 ; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
 ; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, ptr %p, align 8
   %b = fpext <4 x half> %a to <4 x float>
@@ -546,27 +536,24 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 {
 ;
 ; CHECK-I686-LABEL: test_extend64_vec4:
 ; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $104, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, 6(%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    pinsrw $0, 2(%eax), %xmm0
-; CHECK-I686-NEXT:    pinsrw $0, 4(%eax), %xmm1
-; CHECK-I686-NEXT:    pextrw $0, %xmm1, %eax
+; CHECK-I686-NEXT:    subl $68, %esp
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-I686-NEXT:    pinsrw $0, 4(%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
+; CHECK-I686-NEXT:    pinsrw $0, 2(%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %edi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
+; CHECK-I686-NEXT:    movw %di, (%esp)
+; CHECK-I686-NEXT:    pinsrw $0, (%esi), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %edi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movw %si, (%esp)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movw %di, (%esp)
+; CHECK-I686-NEXT:    pinsrw $0, 6(%esi), %xmm0
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    movw %si, (%esp)
@@ -581,8 +568,9 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 {
 ; CHECK-I686-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-I686-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-I686-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; CHECK-I686-NEXT:    addl $104, %esp
+; CHECK-I686-NEXT:    addl $68, %esp
 ; CHECK-I686-NEXT:    popl %esi
+; CHECK-I686-NEXT:    popl %edi
 ; CHECK-I686-NEXT:    retl
   %a = load <4 x half>, ptr %p, align 8
   %b = fpext <4 x half> %a to <4 x double>
@@ -632,20 +620,14 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 {
 ; CHECK-I686-LABEL: test_trunc32_vec4:
 ; CHECK-I686:       # %bb.0:
 ; CHECK-I686-NEXT:    pushl %esi
-; CHECK-I686-NEXT:    subl $88, %esp
+; CHECK-I686-NEXT:    subl $72, %esp
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-I686-NEXT:    movaps %xmm0, %xmm1
-; CHECK-I686-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-I686-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; CHECK-I686-NEXT:    movss %xmm1, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-I686-NEXT:    movss %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
@@ -653,7 +635,11 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-I686-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-I686-NEXT:    movss %xmm1, (%esp)
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
@@ -661,10 +647,10 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, 4(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, 2(%esi)
-; CHECK-I686-NEXT:    addl $88, %esp
+; CHECK-I686-NEXT:    addl $72, %esp
 ; CHECK-I686-NEXT:    popl %esi
 ; CHECK-I686-NEXT:    retl
   %v = fptrunc <4 x float> %a to <4 x half>
@@ -747,11 +733,6 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    subl $88, %esp
 ; CHECK-I686-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
-; CHECK-I686-NEXT:    calll __truncdfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -762,7 +743,10 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-I686-NEXT:    movlps %xmm1, (%esp)
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-I686-NEXT:    movw %ax, 6(%esi)
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
@@ -770,7 +754,7 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 {
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, 2(%esi)
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esi)
 ; CHECK-I686-NEXT:    addl $88, %esp
@@ -854,18 +838,16 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
 ;
 ; CHECK-I686-LABEL: test_sitofp_fadd_i32:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    subl $60, %esp
-; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    xorps %xmm0, %xmm0
+; CHECK-I686-NEXT:    pushl %esi
+; CHECK-I686-NEXT:    subl $40, %esp
 ; CHECK-I686-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-I686-NEXT:    pinsrw $0, (%eax), %xmm0
+; CHECK-I686-NEXT:    pextrw $0, %xmm0, %esi
 ; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
-; CHECK-I686-NEXT:    movw %ax, (%esp)
+; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movw %si, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
@@ -880,7 +862,8 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    addl $60, %esp
+; CHECK-I686-NEXT:    addl $40, %esp
+; CHECK-I686-NEXT:    popl %esi
 ; CHECK-I686-NEXT:    retl
   %tmp0 = load half, ptr %b
   %tmp1 = sitofp i32 %a to half
@@ -924,14 +907,14 @@ define half @PR40273(half) #0 {
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    xorl %eax, %eax
-; CHECK-I686-NEXT:    xorps %xmm1, %xmm1
-; CHECK-I686-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-I686-NEXT:    movl $15360, %ecx # imm = 0x3C00
-; CHECK-I686-NEXT:    cmovnel %ecx, %eax
-; CHECK-I686-NEXT:    cmovpl %ecx, %eax
-; CHECK-I686-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-I686-NEXT:    movl $15360, %eax # imm = 0x3C00
+; CHECK-I686-NEXT:    xorl %ecx, %ecx
+; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    ucomiss %xmm0, %xmm1
+; CHECK-I686-NEXT:    cmovnel %eax, %ecx
+; CHECK-I686-NEXT:    cmovpl %eax, %ecx
+; CHECK-I686-NEXT:    pinsrw $0, %ecx, %xmm0
 ; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %2 = fcmp une half %0, 0xH0000
@@ -976,9 +959,9 @@ define void @brcond(half %0) #0 {
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    xorps %xmm1, %xmm1
-; CHECK-I686-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    ucomiss %xmm0, %xmm1
 ; CHECK-I686-NEXT:    setp %al
 ; CHECK-I686-NEXT:    setne %cl
 ; CHECK-I686-NEXT:    orb %al, %cl
@@ -1078,14 +1061,14 @@ define void @main.158() #0 {
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-I686-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
-; CHECK-I686-NEXT:    xorps %xmm0, %xmm0
+; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-I686-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm1
 ; CHECK-I686-NEXT:    jae .LBB20_2
 ; CHECK-I686-NEXT:  # %bb.1: # %entry
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-I686-NEXT:    movd {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-I686-NEXT:  .LBB20_2: # %entry
-; CHECK-I686-NEXT:    movss %xmm0, (%esp)
+; CHECK-I686-NEXT:    movd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%eax)
@@ -1162,9 +1145,9 @@ define void @main.45() #0 {
 ; CHECK-I686-NEXT:    movw %si, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-I686-NEXT:    movl $32256, %eax # imm = 0x7E00
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-I686-NEXT:    movl $32256, %eax # imm = 0x7E00
 ; CHECK-I686-NEXT:    cmovnpl %esi, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%eax)
 ; CHECK-I686-NEXT:    addl $8, %esp
@@ -1204,9 +1187,9 @@ define half @fcopysign(half %x, half %y) {
 ;
 ; CHECK-I686-LABEL: fcopysign:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm1
 ; CHECK-I686-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; CHECK-I686-NEXT:    por %xmm1, %xmm0
 ; CHECK-I686-NEXT:    retl
@@ -1302,14 +1285,12 @@ define half @pr61271(half %0, half %1) #0 {
 ;
 ; CHECK-I686-LABEL: pr61271:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    subl $44, %esp
-; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; CHECK-I686-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    subl $12, %esp
 ; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    calll __extendhfsf2
-; CHECK-I686-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -1319,7 +1300,7 @@ define half @pr61271(half %0, half %1) #0 {
 ; CHECK-I686-NEXT:    minss {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
-; CHECK-I686-NEXT:    addl $44, %esp
+; CHECK-I686-NEXT:    addl $12, %esp
 ; CHECK-I686-NEXT:    retl
   %3 = call fast half @llvm.minnum.f16(half %0, half %1)
   ret half %3
@@ -2055,38 +2036,38 @@ define void @pr63114() {
 ;
 ; CHECK-I686-LABEL: pr63114:
 ; CHECK-I686:       # %bb.0:
-; CHECK-I686-NEXT:    movdqu (%eax), %xmm6
-; CHECK-I686-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7]
-; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
-; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
-; CHECK-I686-NEXT:    pand %xmm1, %xmm0
-; CHECK-I686-NEXT:    movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
-; CHECK-I686-NEXT:    por %xmm2, %xmm0
-; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
-; CHECK-I686-NEXT:    pand %xmm3, %xmm0
-; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60]
-; CHECK-I686-NEXT:    por %xmm4, %xmm0
-; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7]
-; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; CHECK-I686-NEXT:    pand %xmm1, %xmm5
-; CHECK-I686-NEXT:    por %xmm2, %xmm5
-; CHECK-I686-NEXT:    pand %xmm3, %xmm5
-; CHECK-I686-NEXT:    por %xmm4, %xmm5
-; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5]
-; CHECK-I686-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,3,0,3]
-; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
-; CHECK-I686-NEXT:    pand %xmm1, %xmm6
-; CHECK-I686-NEXT:    por %xmm2, %xmm6
-; CHECK-I686-NEXT:    pand %xmm3, %xmm6
-; CHECK-I686-NEXT:    por %xmm4, %xmm6
-; CHECK-I686-NEXT:    pand %xmm1, %xmm7
-; CHECK-I686-NEXT:    por %xmm2, %xmm7
-; CHECK-I686-NEXT:    pand %xmm3, %xmm7
-; CHECK-I686-NEXT:    por %xmm4, %xmm7
-; CHECK-I686-NEXT:    movdqu %xmm7, 0
-; CHECK-I686-NEXT:    movdqu %xmm6, 32
-; CHECK-I686-NEXT:    movdqu %xmm5, 48
-; CHECK-I686-NEXT:    movdqu %xmm0, 16
+; CHECK-I686-NEXT:    movdqu (%eax), %xmm4
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,5,5]
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
+; CHECK-I686-NEXT:    pand %xmm0, %xmm5
+; CHECK-I686-NEXT:    movq {{.*#+}} xmm1 = [0,0,0,15360,0,0,0,0]
+; CHECK-I686-NEXT:    por %xmm1, %xmm5
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
+; CHECK-I686-NEXT:    pand %xmm2, %xmm5
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60]
+; CHECK-I686-NEXT:    por %xmm3, %xmm5
+; CHECK-I686-NEXT:    movdqu %xmm5, 0
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,7]
+; CHECK-I686-NEXT:    pshuflw {{.*#+}} xmm6 = xmm4[0,1,3,3,4,5,6,7]
+; CHECK-I686-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3,0,3]
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; CHECK-I686-NEXT:    pand %xmm0, %xmm4
+; CHECK-I686-NEXT:    por %xmm1, %xmm4
+; CHECK-I686-NEXT:    pand %xmm2, %xmm4
+; CHECK-I686-NEXT:    por %xmm3, %xmm4
+; CHECK-I686-NEXT:    movdqu %xmm4, 32
+; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; CHECK-I686-NEXT:    pand %xmm0, %xmm4
+; CHECK-I686-NEXT:    por %xmm1, %xmm4
+; CHECK-I686-NEXT:    pand %xmm2, %xmm4
+; CHECK-I686-NEXT:    por %xmm3, %xmm4
+; CHECK-I686-NEXT:    movdqu %xmm4, 48
+; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,0,2,1]
+; CHECK-I686-NEXT:    pand %xmm0, %xmm4
+; CHECK-I686-NEXT:    por %xmm1, %xmm4
+; CHECK-I686-NEXT:    pand %xmm2, %xmm4
+; CHECK-I686-NEXT:    por %xmm3, %xmm4
+; CHECK-I686-NEXT:    movdqu %xmm4, 16
 ; CHECK-I686-NEXT:    retl
   %1 = load <24 x half>, ptr poison, align 2
   %2 = shufflevector <24 x half> %1, <24 x half> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
diff --git a/llvm/test/CodeGen/X86/hipe-cc.ll b/llvm/test/CodeGen/X86/hipe-cc.ll
index a7bcd17b23275..b69d0b270f46d 100644
--- a/llvm/test/CodeGen/X86/hipe-cc.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc.ll
@@ -51,8 +51,8 @@ define cc 11 void @foo(i32 %hp, i32 %p, i32 %arg0, i32 %arg1, i32 %arg2) nounwin
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $20, %esp
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ecx, (%esp)
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 63302b41e2076..68998f517a76c 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -497,30 +497,21 @@ define i1 @scalar_i64_bitsinmiddle_eq(i64 %x, i64 %y) nounwind {
 define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: vec_4xi32_splat_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; AVX2-LABEL: vec_4xi32_splat_eq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
 ; X64-SSE2-LABEL: vec_4xi32_splat_eq:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm2
@@ -581,30 +572,21 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    pslld $23, %xmm1
 ; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    ret{{[l|q]}}
-;
 ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 05b0a0be2acc8..f55a6faac337e 100644
--- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -101,9 +101,9 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 ;
 ; X86-BMI2-LABEL: scalar_i16_signbit_eq:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    testl $32768, %eax # imm = 0x8000
 ; X86-BMI2-NEXT:    sete %al
 ; X86-BMI2-NEXT:    retl
@@ -163,9 +163,9 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
 ;
 ; X86-BMI2-LABEL: scalar_i16_bitsinmiddle_eq:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    testl $4080, %eax # imm = 0xFF0
 ; X86-BMI2-NEXT:    sete %al
 ; X86-BMI2-NEXT:    retl
@@ -239,7 +239,7 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    btl %ecx, %eax
+; X86-NEXT:    btl %eax, %ecx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -434,11 +434,11 @@ define i1 @scalar_i128_lowestbit_eq(i128 %x, i128 %y) nounwind {
 ; X86-SSE2-LABEL: scalar_i128_lowestbit_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    subl $44, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
-; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl %eax, %ecx
 ; X86-SSE2-NEXT:    shrb $3, %cl
 ; X86-SSE2-NEXT:    andb $12, %cl
@@ -452,11 +452,11 @@ define i1 @scalar_i128_lowestbit_eq(i128 %x, i128 %y) nounwind {
 ; X86-BMI2-AVX2-LABEL: scalar_i128_lowestbit_eq:
 ; X86-BMI2-AVX2:       # %bb.0:
 ; X86-BMI2-AVX2-NEXT:    subl $44, %esp
-; X86-BMI2-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-BMI2-AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
-; X86-BMI2-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-BMI2-AVX2-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-BMI2-AVX2-NEXT:    vmovaps %xmm0, (%esp)
+; X86-BMI2-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI2-AVX2-NEXT:    movl %eax, %ecx
 ; X86-BMI2-AVX2-NEXT:    shrb $3, %cl
 ; X86-BMI2-AVX2-NEXT:    andb $12, %cl
@@ -511,14 +511,14 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; AVX2-LABEL: vec_4xi32_splat_eq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-BMI2-AVX2-LABEL: vec_4xi32_splat_eq:
+; X86-BMI2-AVX2:       # %bb.0:
+; X86-BMI2-AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; X86-BMI2-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-BMI2-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: vec_4xi32_splat_eq:
 ; X64-SSE2:       # %bb.0:
@@ -597,14 +597,14 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwi
 ; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
-; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-BMI2-AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq:
+; X86-BMI2-AVX2:       # %bb.0:
+; X86-BMI2-AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; X86-BMI2-AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-BMI2-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; X86-BMI2-AVX2-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq:
 ; X64-SSE2:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index 93049f9987a5e..cde9e69d7ccd0 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -16,10 +16,10 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
@@ -431,9 +431,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
@@ -1039,9 +1039,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i64:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
@@ -1051,42 +1051,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
-; X86-SSE2-NEXT:    por %xmm0, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm7
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm7, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm3, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm5, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 47bb0957f3fbb..73faa5f4b6920 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -16,10 +16,10 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
@@ -433,9 +433,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
@@ -1043,9 +1043,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i64:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
@@ -1055,42 +1055,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
-; X86-SSE2-NEXT:    por %xmm1, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
-; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
-; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index 9946267b48e7f..6032ab2f2cbc4 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -16,10 +16,10 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
@@ -40,37 +40,37 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; X86-SSE42-NEXT:    movd %xmm2, %eax
-; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm2
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movd %xmm3, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm3, %edx
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v2i64:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT:    ## xmm2 = mem[0,0]
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm2
-; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    ## xmm1 = mem[0,0]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm3, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: test_reduce_v2i64:
 ; X86-AVX2:       ## %bb.0:
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm3, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    retl
@@ -147,25 +147,25 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
-; X86-SSE2-NEXT:    por %xmm1, %xmm3
-; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v4i32:
@@ -321,7 +321,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v8i16:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    notl %eax
@@ -437,7 +437,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v16i8:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
@@ -484,9 +484,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
@@ -513,9 +513,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT:    movapd %xmm1, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm3, %xmm0
 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
 ; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
 ; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
@@ -532,11 +532,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm4
 ; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    vzeroupper
@@ -550,11 +550,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm4
 ; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    vzeroupper
@@ -680,24 +680,24 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
 ; X86-SSE2-NEXT:    por %xmm3, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm3, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
-; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i32:
@@ -938,7 +938,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    notl %eax
@@ -1088,7 +1088,7 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1124,9 +1124,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i64:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
@@ -1136,42 +1136,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
-; X86-SSE2-NEXT:    por %xmm0, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm7
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm7, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
-; X86-SSE2-NEXT:    por %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm3, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm5, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
@@ -1193,26 +1193,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
-; X86-SSE42-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm6
 ; X86-SSE42-NEXT:    pxor %xmm5, %xmm6
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
-; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; X86-SSE42-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm4
 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
-; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; X86-SSE42-NEXT:    movapd %xmm3, %xmm1
 ; X86-SSE42-NEXT:    xorpd %xmm5, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; X86-SSE42-NEXT:    movapd %xmm2, %xmm0
 ; X86-SSE42-NEXT:    xorpd %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm5
 ; X86-SSE42-NEXT:    pcmpgtq %xmm5, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
@@ -1222,27 +1223,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ;
 ; X86-AVX1-LABEL: test_reduce_v8i64:
 ; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
 ; X86-AVX1-NEXT:    ## xmm2 = mem[0,0]
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm4
-; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm3
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm5
-; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
-; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm3, %xmm4
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm5, %xmm6
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
+; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm5, %xmm3, %xmm3
 ; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm4
-; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm5
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm6
+; X86-AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1260,11 +1261,11 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm4
 ; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
 ; X86-AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1442,48 +1443,48 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i32:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
 ; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
-; X86-SSE2-NEXT:    por %xmm0, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
-; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm1
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm5
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm5, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm1
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
-; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i32:
@@ -1779,7 +1780,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    notl %eax
@@ -1950,7 +1951,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
@@ -2073,7 +2074,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    notl %eax
@@ -2181,7 +2182,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX512-NEXT:    notl %eax
@@ -2301,7 +2302,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
@@ -2426,7 +2427,7 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) {
 ;
 ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 0bbf94f1817f5..e23e4ba4c8080 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -16,10 +16,10 @@
 define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v2i64:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
@@ -40,38 +40,38 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v2i64:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE42-NEXT:    pxor %xmm0, %xmm3
-; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; X86-SSE42-NEXT:    movd %xmm2, %eax
-; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE42-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movd %xmm3, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm3, %edx
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v2i64:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT:    ## xmm2 = mem[0,0]
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm2
-; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    ## xmm1 = mem[0,0]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm3, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: test_reduce_v2i64:
 ; X86-AVX2:       ## %bb.0:
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm3, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    retl
@@ -149,23 +149,23 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) {
 define i32 @test_reduce_v4i32(<4 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v4i32:
 ; X86-SSE2:       ## %bb.0:
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
 ; X86-SSE2-NEXT:    por %xmm2, %xmm1
 ; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    retl
@@ -426,9 +426,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
@@ -456,9 +456,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    movapd %xmm1, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pxor %xmm3, %xmm2
 ; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm2
 ; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
@@ -476,11 +476,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm3, %xmm4
 ; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm3, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    vzeroupper
@@ -488,17 +488,17 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ;
 ; X86-AVX2-LABEL: test_reduce_v4i64:
 ; X86-AVX2:       ## %bb.0:
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm4
-; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; X86-AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm3, %xmm4
+; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
+; X86-AVX2-NEXT:    vxorpd %xmm1, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vxorpd %xmm1, %xmm3, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    vzeroupper
@@ -626,22 +626,22 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm0
 ; X86-SSE2-NEXT:    por %xmm3, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
 ; X86-SSE2-NEXT:    por %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movd %xmm2, %eax
 ; X86-SSE2-NEXT:    retl
@@ -1038,9 +1038,9 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v8i64:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
@@ -1050,42 +1050,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
-; X86-SSE2-NEXT:    por %xmm1, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm6
-; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
-; X86-SSE2-NEXT:    por %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
@@ -1105,32 +1105,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v8i64:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
-; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT:    movapd %xmm2, %xmm5
+; X86-SSE42-NEXT:    xorpd %xmm4, %xmm5
 ; X86-SSE42-NEXT:    movdqa %xmm1, %xmm6
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
-; X86-SSE42-NEXT:    movdqa %xmm4, %xmm1
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
-; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
-; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; X86-SSE42-NEXT:    movapd %xmm2, %xmm1
-; X86-SSE42-NEXT:    xorpd %xmm5, %xmm1
 ; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
-; X86-SSE42-NEXT:    xorpd %xmm5, %xmm0
-; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm5, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm4, %xmm0
 ; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm5, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm1, %xmm5
-; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm5
-; X86-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
 ; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; X86-SSE42-NEXT:    movd %xmm1, %eax
 ; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
@@ -1138,27 +1138,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ;
 ; X86-AVX1-LABEL: test_reduce_v8i64:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
 ; X86-AVX1-NEXT:    ## xmm2 = mem[0,0]
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm3, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm5, %xmm6
-; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm5, %xmm3
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm4
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm5
-; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
-; X86-AVX1-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm0
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm3
 ; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm4
-; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm5
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm6
+; X86-AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1171,16 +1171,16 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
 ; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
 ; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm1, %xmm4
-; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; X86-AVX2-NEXT:    vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm1, %xmm2
-; X86-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm3, %xmm4
+; X86-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm4, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-AVX2-NEXT:    vxorpd %xmm2, %xmm3, %xmm2
+; X86-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm1
+; X86-AVX2-NEXT:    vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1360,46 +1360,46 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v16i32:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
 ; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
-; X86-SSE2-NEXT:    por %xmm1, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
-; X86-SSE2-NEXT:    por %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
-; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm6, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
 ; X86-SSE2-NEXT:    pand %xmm4, %xmm1
-; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm4
 ; X86-SSE2-NEXT:    por %xmm1, %xmm4
 ; X86-SSE2-NEXT:    movd %xmm4, %eax
 ; X86-SSE2-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll
index e9bb75446d589..02413b1dae58a 100644
--- a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll
+++ b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll
@@ -7,13 +7,6 @@
 ;
 
 define <16 x i8> @permute_packss_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
-; CHECK-LABEL: permute_packss_packss_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpackssdw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
-; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
   %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
   %3 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
@@ -22,13 +15,6 @@ define <16 x i8> @permute_packss_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i
 }
 
 define <16 x i8> @permute_packss_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
-; CHECK-LABEL: permute_packss_packus_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpackusdw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
-; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
   %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
   %3 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
@@ -37,13 +23,6 @@ define <16 x i8> @permute_packss_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i
 }
 
 define <8 x i16> @permute_phadd_phadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
-; CHECK-LABEL: permute_phadd_phadd_128:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vphaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
-; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
   %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
   %3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -56,13 +35,6 @@ define <8 x i16> @permute_phadd_phadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
 ;
 
 define <8 x float> @permute_hadd_hadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
-; CHECK-LABEL: permute_hadd_hadd_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vhaddps %ymm3, %ymm2, %ymm1
-; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
-; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
   %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
   %3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2)
@@ -71,13 +43,6 @@ define <8 x float> @permute_hadd_hadd_256(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 define <16 x i16> @permute_phadd_phadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
-; CHECK-LABEL: permute_phadd_phadd_256:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vphaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
-; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
   %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
   %3 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2)
@@ -114,3 +79,5 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
index 54911351e68dc..31d01d7a6d01d 100644
--- a/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
+++ b/llvm/test/CodeGen/X86/horizontal-shuffle-demanded.ll
@@ -9,8 +9,8 @@
 define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_haddps_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -29,8 +29,8 @@ define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2)
 define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_hsubps_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vextractps $2, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -49,8 +49,8 @@ define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2)
 define void @test_demanded_haddpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_haddpd_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlpd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -69,8 +69,8 @@ define void @test_demanded_haddpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2
 define void @test_demanded_hsubpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_hsubpd_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlpd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -89,8 +89,8 @@ define void @test_demanded_hsubpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2
 define void @test_demanded_phaddd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phaddd_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -109,8 +109,8 @@ define void @test_demanded_phaddd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) noun
 define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phsubd_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -129,8 +129,8 @@ define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) noun
 define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phaddw_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -149,8 +149,8 @@ define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) noun
 define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phsubw_128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -173,9 +173,9 @@ define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) noun
 define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_haddps_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -197,9 +197,9 @@ define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2)
 define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_hsubps_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vextractps $3, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -221,9 +221,9 @@ define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2)
 define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_haddpd_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlpd %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -245,9 +245,9 @@ define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2
 define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_hsubpd_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovlpd %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -269,9 +269,9 @@ define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2
 define void @test_demanded_phaddd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phaddd_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrd $3, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -293,9 +293,9 @@ define void @test_demanded_phaddd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) noun
 define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phsubd_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
 ; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -317,9 +317,9 @@ define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) noun
 define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phaddw_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpbroadcastw %xmm1, %xmm0
 ; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -341,8 +341,8 @@ define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) no
 define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
 ; X86-LABEL: test_demanded_phsubw_256:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll
index b4546c1e983c4..bb7d73bb6f4c7 100644
--- a/llvm/test/CodeGen/X86/i128-add.ll
+++ b/llvm/test/CodeGen/X86/i128-add.ll
@@ -10,23 +10,23 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    movl 28(%ebp), %edi
 ; X86-NEXT:    addl 40(%ebp), %esi
 ; X86-NEXT:    adcl 44(%ebp), %edi
-; X86-NEXT:    adcl 48(%ebp), %ecx
-; X86-NEXT:    adcl 52(%ebp), %edx
+; X86-NEXT:    adcl 48(%ebp), %edx
+; X86-NEXT:    adcl 52(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    addl $1, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -53,23 +53,23 @@ define <1 x i128> @add_v1i128(<1 x i128> %x, <1 x i128> %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl $1, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
index 2174d5056e6ce..9e74c4d0b12ba 100644
--- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll
+++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
@@ -53,42 +53,35 @@ define void @store(PrimTy %x, ptr %p) nounwind {
 ;
 ; CHECK-X86-LABEL: store:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
+; CHECK-X86-NEXT:    movl 28(%esp), %eax
+; CHECK-X86-NEXT:    movl 32(%esp), %ecx
+; CHECK-X86-NEXT:    movl %eax, 12(%ecx)
+; CHECK-X86-NEXT:    movl 24(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 8(%ecx)
 ; CHECK-X86-NEXT:    movl 16(%esp), %eax
-; CHECK-X86-NEXT:    movl 20(%esp), %ecx
-; CHECK-X86-NEXT:    movl 24(%esp), %edx
-; CHECK-X86-NEXT:    movl 28(%esp), %esi
-; CHECK-X86-NEXT:    movl 32(%esp), %edi
-; CHECK-X86-NEXT:    movl %esi, 12(%edi)
-; CHECK-X86-NEXT:    movl %edx, 8(%edi)
-; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
-; CHECK-X86-NEXT:    movl %eax, (%edi)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    movl 20(%esp), %edx
+; CHECK-X86-NEXT:    movl %edx, 4(%ecx)
+; CHECK-X86-NEXT:    movl %eax, (%ecx)
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %ecx, 12(%edx)
+; CHECK-MSVC32-NEXT:    movl %eax, 8(%edx)
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
 ; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 24(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
-; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
-; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
-; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edx)
+; CHECK-MSVC32-NEXT:    movl %eax, (%edx)
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
@@ -134,42 +127,35 @@ define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind {
 ;
 ; CHECK-X86-LABEL: store_perturbed:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
+; CHECK-X86-NEXT:    movl 44(%esp), %eax
+; CHECK-X86-NEXT:    movl 48(%esp), %ecx
+; CHECK-X86-NEXT:    movl %eax, 12(%ecx)
+; CHECK-X86-NEXT:    movl 40(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 8(%ecx)
 ; CHECK-X86-NEXT:    movl 32(%esp), %eax
-; CHECK-X86-NEXT:    movl 36(%esp), %ecx
-; CHECK-X86-NEXT:    movl 40(%esp), %edx
-; CHECK-X86-NEXT:    movl 44(%esp), %esi
-; CHECK-X86-NEXT:    movl 48(%esp), %edi
-; CHECK-X86-NEXT:    movl %esi, 12(%edi)
-; CHECK-X86-NEXT:    movl %edx, 8(%edi)
-; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
-; CHECK-X86-NEXT:    movl %eax, (%edi)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    movl 36(%esp), %edx
+; CHECK-X86-NEXT:    movl %edx, 4(%ecx)
+; CHECK-X86-NEXT:    movl %eax, (%ecx)
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store_perturbed:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 40(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %ecx, 12(%edx)
+; CHECK-MSVC32-NEXT:    movl %eax, 8(%edx)
 ; CHECK-MSVC32-NEXT:    movl 24(%ebp), %eax
 ; CHECK-MSVC32-NEXT:    movl 28(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 32(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 36(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 40(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
-; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
-; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
-; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edx)
+; CHECK-MSVC32-NEXT:    movl %eax, (%edx)
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
@@ -212,38 +198,30 @@ define PrimTy @return(ptr %p) nounwind {
 ;
 ; CHECK-X86-LABEL: return:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 16(%esp), %ecx
+; CHECK-X86-NEXT:    movl 4(%esp), %eax
+; CHECK-X86-NEXT:    movl 8(%esp), %ecx
+; CHECK-X86-NEXT:    movl 12(%ecx), %edx
+; CHECK-X86-NEXT:    movl %edx, 12(%eax)
+; CHECK-X86-NEXT:    movl 8(%ecx), %edx
+; CHECK-X86-NEXT:    movl %edx, 8(%eax)
 ; CHECK-X86-NEXT:    movl (%ecx), %edx
-; CHECK-X86-NEXT:    movl 4(%ecx), %esi
-; CHECK-X86-NEXT:    movl 8(%ecx), %edi
-; CHECK-X86-NEXT:    movl 12(%ecx), %ecx
-; CHECK-X86-NEXT:    movl %ecx, 12(%eax)
-; CHECK-X86-NEXT:    movl %edi, 8(%eax)
-; CHECK-X86-NEXT:    movl %esi, 4(%eax)
+; CHECK-X86-NEXT:    movl 4(%ecx), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %edx, (%eax)
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: return:
 ; CHECK-MSVC32:       # %bb.0:
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %ecx
+; CHECK-MSVC32-NEXT:    movl 4(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl 8(%esp), %ecx
+; CHECK-MSVC32-NEXT:    movl 12(%ecx), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 12(%eax)
+; CHECK-MSVC32-NEXT:    movl 8(%ecx), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl (%ecx), %edx
-; CHECK-MSVC32-NEXT:    movl 4(%ecx), %esi
-; CHECK-MSVC32-NEXT:    movl 8(%ecx), %edi
-; CHECK-MSVC32-NEXT:    movl 12(%ecx), %ecx
-; CHECK-MSVC32-NEXT:    movl %ecx, 12(%eax)
-; CHECK-MSVC32-NEXT:    movl %edi, 8(%eax)
-; CHECK-MSVC32-NEXT:    movl %esi, 4(%eax)
+; CHECK-MSVC32-NEXT:    movl 4(%ecx), %ecx
+; CHECK-MSVC32-NEXT:    movl %ecx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, (%eax)
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
 ; CHECK-MSVC32-NEXT:    retl
   %r = load PrimTy, ptr %p, align 16
   ret PrimTy %r
@@ -282,42 +260,35 @@ define PrimTy @first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: first_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 44(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 12(%eax)
+; CHECK-X86-NEXT:    movl 40(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-X86-NEXT:    movl 32(%esp), %ecx
 ; CHECK-X86-NEXT:    movl 36(%esp), %edx
-; CHECK-X86-NEXT:    movl 40(%esp), %esi
-; CHECK-X86-NEXT:    movl 44(%esp), %edi
-; CHECK-X86-NEXT:    movl %edi, 12(%eax)
-; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: first_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 12(%eax)
+; CHECK-MSVC32-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl 24(%ebp), %ecx
 ; CHECK-MSVC32-NEXT:    movl 28(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 32(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 36(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
-; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
@@ -362,42 +333,35 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw
 ;
 ; CHECK-X86-LABEL: leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 76(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 12(%eax)
+; CHECK-X86-NEXT:    movl 72(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-X86-NEXT:    movl 64(%esp), %ecx
 ; CHECK-X86-NEXT:    movl 68(%esp), %edx
-; CHECK-X86-NEXT:    movl 72(%esp), %esi
-; CHECK-X86-NEXT:    movl 76(%esp), %edi
-; CHECK-X86-NEXT:    movl %edi, 12(%eax)
-; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 12(%eax)
+; CHECK-MSVC32-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
 ; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
-; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
@@ -442,42 +406,35 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr
 ;
 ; CHECK-X86-LABEL: many_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 92(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 12(%eax)
+; CHECK-X86-NEXT:    movl 88(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-X86-NEXT:    movl 80(%esp), %ecx
 ; CHECK-X86-NEXT:    movl 84(%esp), %edx
-; CHECK-X86-NEXT:    movl 88(%esp), %esi
-; CHECK-X86-NEXT:    movl 92(%esp), %edi
-; CHECK-X86-NEXT:    movl %edi, 12(%eax)
-; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 80(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 84(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 12(%eax)
+; CHECK-MSVC32-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl 72(%ebp), %ecx
 ; CHECK-MSVC32-NEXT:    movl 76(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 80(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 84(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
-; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
@@ -520,42 +477,35 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy
 ;
 ; CHECK-X86-LABEL: trailing_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %edi
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 76(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 12(%eax)
+; CHECK-X86-NEXT:    movl 72(%esp), %ecx
+; CHECK-X86-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-X86-NEXT:    movl 64(%esp), %ecx
 ; CHECK-X86-NEXT:    movl 68(%esp), %edx
-; CHECK-X86-NEXT:    movl 72(%esp), %esi
-; CHECK-X86-NEXT:    movl 76(%esp), %edi
-; CHECK-X86-NEXT:    movl %edi, 12(%eax)
-; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
-; CHECK-X86-NEXT:    addl $4, %esp
-; CHECK-X86-NEXT:    popl %esi
-; CHECK-X86-NEXT:    popl %edi
+; CHECK-X86-NEXT:    addl $12, %esp
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %edi
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    subl $16, %esp
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl %edx, 12(%eax)
+; CHECK-MSVC32-NEXT:    movl %ecx, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
 ; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
-; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
-; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
-; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
-; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
@@ -612,43 +562,39 @@ define void @call_first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_first_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    subl $56, %esp
+; CHECK-X86-NEXT:    subl $60, %esp
+; CHECK-X86-NEXT:    movl 76(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 28(%esp)
+; CHECK-X86-NEXT:    movl 72(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 24(%esp)
+; CHECK-X86-NEXT:    movl 68(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 20(%esp)
 ; CHECK-X86-NEXT:    movl 64(%esp), %eax
-; CHECK-X86-NEXT:    movl 68(%esp), %ecx
-; CHECK-X86-NEXT:    movl 72(%esp), %edx
-; CHECK-X86-NEXT:    movl 76(%esp), %esi
-; CHECK-X86-NEXT:    movl %esi, 28(%esp)
-; CHECK-X86-NEXT:    movl %edx, 24(%esp)
-; CHECK-X86-NEXT:    movl %ecx, 20(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 16(%esp)
 ; CHECK-X86-NEXT:    leal 32(%esp), %eax
 ; CHECK-X86-NEXT:    movl %eax, (%esp)
 ; CHECK-X86-NEXT:    calll first_arg at PLT
-; CHECK-X86-NEXT:    addl $52, %esp
-; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    addl $56, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_first_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
 ; CHECK-MSVC32-NEXT:    subl $64, %esp
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 20(%esp)
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
-; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl %esi, 28(%esp)
-; CHECK-MSVC32-NEXT:    movl %edx, 24(%esp)
-; CHECK-MSVC32-NEXT:    movl %ecx, 20(%esp)
 ; CHECK-MSVC32-NEXT:    movl %eax, 16(%esp)
 ; CHECK-MSVC32-NEXT:    leal 32(%esp), %eax
 ; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
 ; CHECK-MSVC32-NEXT:    calll _first_arg
-; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @first_arg(PrimTy %x)
@@ -738,15 +684,14 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    subl $92, %esp
+; CHECK-X86-NEXT:    movl 108(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 60(%esp)
+; CHECK-X86-NEXT:    movl 104(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 56(%esp)
+; CHECK-X86-NEXT:    movl 100(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 52(%esp)
 ; CHECK-X86-NEXT:    movl 96(%esp), %eax
-; CHECK-X86-NEXT:    movl 100(%esp), %ecx
-; CHECK-X86-NEXT:    movl 104(%esp), %edx
-; CHECK-X86-NEXT:    movl 108(%esp), %esi
-; CHECK-X86-NEXT:    movl %esi, 60(%esp)
-; CHECK-X86-NEXT:    movl %edx, 56(%esp)
-; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 48(%esp)
 ; CHECK-X86-NEXT:    leal 64(%esp), %eax
 ; CHECK-X86-NEXT:    movl %eax, (%esp)
@@ -759,24 +704,22 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ; CHECK-X86-NEXT:    movl $0, 8(%esp)
 ; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll leading_args at PLT
-; CHECK-X86-NEXT:    addl $84, %esp
-; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    addl $88, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
 ; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 52(%esp)
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
-; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
-; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
-; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
 ; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
 ; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
 ; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
@@ -789,8 +732,7 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
 ; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _leading_args
-; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
@@ -899,15 +841,14 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_many_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    subl $104, %esp
+; CHECK-X86-NEXT:    subl $108, %esp
+; CHECK-X86-NEXT:    movl 124(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 76(%esp)
+; CHECK-X86-NEXT:    movl 120(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 72(%esp)
+; CHECK-X86-NEXT:    movl 116(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 68(%esp)
 ; CHECK-X86-NEXT:    movl 112(%esp), %eax
-; CHECK-X86-NEXT:    movl 116(%esp), %ecx
-; CHECK-X86-NEXT:    movl 120(%esp), %edx
-; CHECK-X86-NEXT:    movl 124(%esp), %esi
-; CHECK-X86-NEXT:    movl %esi, 76(%esp)
-; CHECK-X86-NEXT:    movl %edx, 72(%esp)
-; CHECK-X86-NEXT:    movl %ecx, 68(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 64(%esp)
 ; CHECK-X86-NEXT:    leal 80(%esp), %eax
 ; CHECK-X86-NEXT:    movl %eax, (%esp)
@@ -924,24 +865,22 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ; CHECK-X86-NEXT:    movl $0, 8(%esp)
 ; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll many_leading_args at PLT
-; CHECK-X86-NEXT:    addl $100, %esp
-; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    addl $104, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
 ; CHECK-MSVC32-NEXT:    subl $112, %esp
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 76(%esp)
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 72(%esp)
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 68(%esp)
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
-; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl %esi, 76(%esp)
-; CHECK-MSVC32-NEXT:    movl %edx, 72(%esp)
-; CHECK-MSVC32-NEXT:    movl %ecx, 68(%esp)
 ; CHECK-MSVC32-NEXT:    movl %eax, 64(%esp)
 ; CHECK-MSVC32-NEXT:    leal 80(%esp), %eax
 ; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
@@ -958,8 +897,7 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
 ; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _many_leading_args
-; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x)
@@ -1049,15 +987,14 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_trailing_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    subl $92, %esp
+; CHECK-X86-NEXT:    movl 108(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 60(%esp)
+; CHECK-X86-NEXT:    movl 104(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 56(%esp)
+; CHECK-X86-NEXT:    movl 100(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, 52(%esp)
 ; CHECK-X86-NEXT:    movl 96(%esp), %eax
-; CHECK-X86-NEXT:    movl 100(%esp), %ecx
-; CHECK-X86-NEXT:    movl 104(%esp), %edx
-; CHECK-X86-NEXT:    movl 108(%esp), %esi
-; CHECK-X86-NEXT:    movl %esi, 60(%esp)
-; CHECK-X86-NEXT:    movl %edx, 56(%esp)
-; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 48(%esp)
 ; CHECK-X86-NEXT:    leal 64(%esp), %eax
 ; CHECK-X86-NEXT:    movl %eax, (%esp)
@@ -1070,24 +1007,22 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ; CHECK-X86-NEXT:    movl $0, 8(%esp)
 ; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll trailing_arg at PLT
-; CHECK-X86-NEXT:    addl $84, %esp
-; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    addl $88, %esp
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
-; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
 ; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, 52(%esp)
 ; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
-; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
-; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
-; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
-; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
-; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
-; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
 ; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
 ; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
 ; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
@@ -1100,8 +1035,7 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
 ; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _trailing_arg
-; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
-; CHECK-MSVC32-NEXT:    popl %esi
+; CHECK-MSVC32-NEXT:    movl %ebp, %esp
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index cffd88c55bb0a..29e42fbdbf144 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -152,13 +152,13 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-NOBMI-NEXT:    movl %ecx, %ebx
 ; X86-NOBMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOBMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NOBMI-NEXT:    adcl $0, %eax
-; X86-NOBMI-NEXT:    adcl $0, %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl %ecx, (%esi,%ebx,8)
 ; X86-NOBMI-NEXT:    movl %ebx, %ecx
+; X86-NOBMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NOBMI-NEXT:    movl %edi, 4(%esi,%ebx,8)
+; X86-NOBMI-NEXT:    adcl $0, %eax
+; X86-NOBMI-NEXT:    adcl $0, %edx
 ; X86-NOBMI-NEXT:    addl $1, %ecx
 ; X86-NOBMI-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NOBMI-NEXT:    adcl $0, %edi
@@ -198,52 +198,49 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind {
 ; X86-BMI-NEXT:  .LBB1_2: # %for.body
 ; X86-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-BMI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-BMI-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-BMI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    movl (%eax,%ebx,8), %ecx
 ; X86-BMI-NEXT:    movl 4(%eax,%ebx,8), %esi
-; X86-BMI-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %ecx, %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI-NEXT:    mulxl %eax, %edx, %edi
 ; X86-BMI-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-BMI-NEXT:    movl %esi, %edx
-; X86-BMI-NEXT:    mulxl %eax, %esi, %eax
-; X86-BMI-NEXT:    addl %edi, %esi
-; X86-BMI-NEXT:    adcl $0, %eax
+; X86-BMI-NEXT:    mulxl %eax, %eax, %esi
+; X86-BMI-NEXT:    addl %edi, %eax
+; X86-BMI-NEXT:    adcl $0, %esi
 ; X86-BMI-NEXT:    movl %ecx, %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI-NEXT:    mulxl %ecx, %edi, %ebp
-; X86-BMI-NEXT:    addl %esi, %edi
-; X86-BMI-NEXT:    adcl %eax, %ebp
-; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-BMI-NEXT:    mulxl %ecx, %ecx, %eax
+; X86-BMI-NEXT:    addl %eax, %edi
+; X86-BMI-NEXT:    adcl %esi, %ebp
+; X86-BMI-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-BMI-NEXT:    mulxl %ecx, %ecx, %esi
 ; X86-BMI-NEXT:    setb %dl
 ; X86-BMI-NEXT:    addl %ebp, %ecx
 ; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI-NEXT:    movzbl %dl, %edx
-; X86-BMI-NEXT:    adcl %edx, %eax
-; X86-BMI-NEXT:    movl %eax, %edx
+; X86-BMI-NEXT:    adcl %edx, %esi
 ; X86-BMI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-BMI-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
-; X86-BMI-NEXT:    adcl $0, %ecx
-; X86-BMI-NEXT:    adcl $0, %edx
-; X86-BMI-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl %eax, (%edx,%ebx,8)
+; X86-BMI-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-BMI-NEXT:    movl %edi, 4(%edx,%ebx,8)
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT:    adcl $0, %ecx
+; X86-BMI-NEXT:    movl %esi, %eax
+; X86-BMI-NEXT:    adcl $0, %eax
 ; X86-BMI-NEXT:    addl $1, %ebx
 ; X86-BMI-NEXT:    adcl $0, %ebp
 ; X86-BMI-NEXT:    movl %ebx, %edx
-; X86-BMI-NEXT:    xorl %esi, %edx
+; X86-BMI-NEXT:    xorl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    movl %ebp, %esi
 ; X86-BMI-NEXT:    xorl %edi, %esi
 ; X86-BMI-NEXT:    orl %edx, %esi
-; X86-BMI-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-BMI-NEXT:    jne .LBB1_2
 ; X86-BMI-NEXT:  .LBB1_3: # %for.end
 ; X86-BMI-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index f78e34ef60569..98db09c31686f 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -10,31 +10,30 @@ define i128 @test1(i128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shrl $30, %edx
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl 28(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl 28(%ebp), %eax
 ; X86-NEXT:    adcl 32(%ebp), %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    shrdl $2, %ecx, %edx
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $2, %esi
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    shrdl $2, %ecx, %edx
+; X86-NEXT:    sarl $2, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -59,41 +58,38 @@ define i128 @test2(i128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shrl $30, %ecx
+; X86-NEXT:    shrl $30, %edx
 ; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    adcl 28(%ebp), %edx
-; X86-NEXT:    adcl 32(%ebp), %ecx
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    shrdl $2, %eax, %ecx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl 28(%ebp), %eax
+; X86-NEXT:    adcl 32(%ebp), %edx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    shrdl $2, %ecx, %edx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    sarl $2, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sarl $2, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -118,8 +114,322 @@ define i128 @test2(i128 %x) nounwind {
 
 define i128 @test3(i128 %x) nounwind {
 ; X86-LABEL: test3:
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
-; X86: udiv-do-while
+; X86:       # %bb.0: # %_udiv-special-cases
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    xorl %edi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    jne .LBB2_1
+; X86-NEXT:  # %bb.2: # %_udiv-special-cases
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    jmp .LBB2_3
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:  .LBB2_3: # %_udiv-special-cases
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  # %bb.5: # %_udiv-special-cases
+; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB2_6
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    bsrl %ecx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB2_6: # %_udiv-special-cases
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    jne .LBB2_8
+; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:  .LBB2_8: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $61, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    je .LBB2_9
+; X86-NEXT:  # %bb.10: # %select.false.sink
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB2_11: # %select.end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    jne .LBB2_13
+; X86-NEXT:  # %bb.12: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:  .LBB2_13: # %select.end
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    notl %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB2_14
+; X86-NEXT:  # %bb.20: # %select.end
+; X86-NEXT:    xorl $127, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    je .LBB2_21
+; X86-NEXT:  # %bb.18: # %udiv-bb1
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    movl 136(%esp,%eax), %esi
+; X86-NEXT:    movl 140(%esp,%eax), %edx
+; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 132(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB2_19
+; X86-NEXT:  # %bb.15: # %udiv-preheader
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shrb $3, %al
+; X86-NEXT:    andb $12, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NEXT:    movl 88(%esp,%eax), %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%eax), %esi
+; X86-NEXT:    movl 84(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $3, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB2_16: # %udiv-do-while
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl $3, %ecx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal (%edx,%edx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $31, %ecx, %edx
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    jne .LBB2_16
+; X86-NEXT:  .LBB2_17: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal (%edx,%eax,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:  .LBB2_21: # %udiv-end
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    xorl %edi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB2_9:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB2_11
+; X86-NEXT:  .LBB2_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_17
+; X86-NEXT:  .LBB2_14:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jmp .LBB2_21
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
@@ -129,6 +439,7 @@ define i128 @test3(i128 %x) nounwind {
 ; X64-NEXT:    callq __divti3 at PLT
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
+; X86 doesn't have __divti3, so the urem is expanded into a loop.
   %tmp = sdiv i128 %x, -73786976294838206467
   ret i128 %tmp
 }
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index e347c5944c2b9..326d762364e90 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -9,11 +9,11 @@ define i128 @test1(i128 %x) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    shrdl $2, %edx, %ecx
 ; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
@@ -44,95 +44,88 @@ define i128 @test2(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $144, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB1_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %esi, %ebx
+; X86-NEXT:    bsrl %ecx, %ebx
 ; X86-NEXT:    xorl $31, %ebx
 ; X86-NEXT:    orl $32, %ebx
 ; X86-NEXT:    jmp .LBB1_3
 ; X86-NEXT:  .LBB1_1:
-; X86-NEXT:    bsrl %edi, %ebx
+; X86-NEXT:    bsrl %edx, %ebx
 ; X86-NEXT:    xorl $31, %ebx
 ; X86-NEXT:  .LBB1_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %edx
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB1_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    bsrl 24(%ebp), %eax
 ; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    orl $32, %eax
-; X86-NEXT:    jmp .LBB1_6
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    je .LBB1_7
+; X86-NEXT:    jmp .LBB1_8
 ; X86-NEXT:  .LBB1_4:
-; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    bsrl %eax, %eax
 ; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:  .LBB1_6: # %_udiv-special-cases
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    jne .LBB1_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB1_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %eax
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:  .LBB1_8: # %_udiv-special-cases
 ; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    je .LBB1_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB1_11: # %select.end
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    jne .LBB1_13
 ; X86-NEXT:  # %bb.12: # %select.end
 ; X86-NEXT:    movl 28(%ebp), %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB1_13: # %select.end
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    jne .LBB1_14
-; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    jne .LBB1_19
+; X86-NEXT:  # %bb.14: # %select.end
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    xorl $127, %edx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    je .LBB1_21
-; X86-NEXT:  # %bb.18: # %udiv-bb1
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB1_19
+; X86-NEXT:  # %bb.15: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 28(%ebp), %eax
@@ -145,32 +138,34 @@ define i128 @test2(i128 %x) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 120(%esp,%eax), %edx
-; X86-NEXT:    movl 124(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 112(%esp,%eax), %edi
-; X86-NEXT:    movl 116(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 120(%esp,%edx), %edi
+; X86-NEXT:    movl 124(%esp,%edx), %eax
 ; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 112(%esp,%edx), %esi
+; X86-NEXT:    movl 116(%esp,%edx), %eax
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    addl $1, %ecx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jb .LBB1_19
-; X86-NEXT:  # %bb.15: # %udiv-preheader
+; X86-NEXT:    jb .LBB1_20
+; X86-NEXT:  # %bb.16: # %udiv-preheader
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 28(%ebp), %eax
@@ -183,121 +178,120 @@ define i128 @test2(i128 %x) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    movl 76(%esp,%esi), %eax
-; X86-NEXT:    movl 72(%esp,%esi), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    andb $12, %cl
+; X86-NEXT:    movzbl %cl, %edi
+; X86-NEXT:    movl 76(%esp,%edi), %esi
+; X86-NEXT:    movl 72(%esp,%edi), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 64(%esp,%esi), %edx
-; X86-NEXT:    movl 68(%esp,%esi), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%esp,%edi), %edx
+; X86-NEXT:    movl 68(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $-4, %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB1_16: # %udiv-do-while
+; X86-NEXT:  .LBB1_17: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    andl %edi, %ecx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl $-4, %edx
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    subl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %ebx
+; X86-NEXT:    addl $-1, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    leal (%edx,%edx), %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %edx
+; X86-NEXT:    shrdl $31, %esi, %eax
+; X86-NEXT:    shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    andl %eax, %esi
-; X86-NEXT:    movl $-4, %eax
-; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl $-1, %esi
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    jne .LBB1_16
-; X86-NEXT:  .LBB1_17: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    jne .LBB1_17
+; X86-NEXT:  .LBB1_18: # %udiv-loop-exit
+; X86-NEXT:    leal (%ebx,%edi,2), %ecx
+; X86-NEXT:    shrdl $31, %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %esi
+; X86-NEXT:    shrdl $31, %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:  .LBB1_19: # %udiv-end
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:  .LBB1_21: # %udiv-end
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
 ; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
@@ -307,16 +301,12 @@ define i128 @test2(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB1_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB1_11
-; X86-NEXT:  .LBB1_19:
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jmp .LBB1_17
-; X86-NEXT:  .LBB1_14:
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:  .LBB1_20:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    jmp .LBB1_21
+; X86-NEXT:    jmp .LBB1_18
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0:
@@ -341,70 +331,67 @@ define i128 @test3(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    movl 36(%ebp), %edx
-; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB2_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %ebx
-; X86-NEXT:    xorl $31, %ebx
-; X86-NEXT:    orl $32, %ebx
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB2_3
 ; X86-NEXT:  .LBB2_1:
-; X86-NEXT:    bsrl %edx, %ebx
-; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB2_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB2_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB2_6
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB2_7
+; X86-NEXT:    jmp .LBB2_8
 ; X86-NEXT:  .LBB2_4:
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB2_6: # %_udiv-special-cases
 ; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB2_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB2_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB2_8: # %_udiv-special-cases
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    orl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    orl 32(%ebp), %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    je .LBB2_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    movl $127, %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:  .LBB2_11: # %select.end
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jne .LBB2_13
 ; X86-NEXT:  # %bb.12: # %select.end
 ; X86-NEXT:    movl 28(%ebp), %ecx
@@ -412,63 +399,65 @@ define i128 @test3(i128 %x) nounwind {
 ; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:  .LBB2_13: # %select.end
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:  .LBB2_13: # %select.end
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB2_14
 ; X86-NEXT:  # %bb.20: # %select.end
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    je .LBB2_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorb $127, %cl
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %edi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
 ; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    shldl %cl, %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    shll %cl, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB2_19
 ; X86-NEXT:  # %bb.15: # %udiv-preheader
-; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 28(%ebp), %eax
@@ -481,133 +470,137 @@ define i128 @test3(i128 %x) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    andb $12, %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    movl 92(%esp,%edx), %edi
-; X86-NEXT:    movl 88(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrdl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %esi
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movl 84(%esp,%edx), %ebx
 ; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shrdl %cl, %esi, %edx
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $-3, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $-5, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    adcl $-1, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $-3, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $-5, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB2_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %edx, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    movl $-5, %edx
-; X86-NEXT:    andl %edx, %esi
-; X86-NEXT:    movl $-3, %edx
 ; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl $-3, %ecx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl $-1, %esi
-; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    andl $1, %edi
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl $31, %esi, %ecx
+; X86-NEXT:    shrdl $31, %edx, %esi
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    jne .LBB2_16
 ; X86-NEXT:  .LBB2_17: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %ebx
-; X86-NEXT:    shldl $1, %esi, %ecx
-; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    shrdl $31, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal (%eax,%edx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrdl $31, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %ecx
 ; X86-NEXT:  .LBB2_21: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -615,18 +608,18 @@ define i128 @test3(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB2_9:
-; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movb $1, %cl
 ; X86-NEXT:    jmp .LBB2_11
 ; X86-NEXT:  .LBB2_19:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    jmp .LBB2_17
 ; X86-NEXT:  .LBB2_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jmp .LBB2_21
 ;
 ; X64-LABEL: test3:
@@ -651,95 +644,91 @@ define i128 @div_by_7(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB3_1
 ; X86-NEXT:  # %bb.2: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB3_3
 ; X86-NEXT:  .LBB3_1:
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB3_3: # %entry_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB3_4
 ; X86-NEXT:  # %bb.5: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB3_6
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    je .LBB3_7
+; X86-NEXT:    jmp .LBB3_8
 ; X86-NEXT:  .LBB3_4:
-; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB3_6: # %entry_udiv-special-cases
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB3_8
-; X86-NEXT:  # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT:  .LBB3_7: # %entry_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB3_8: # %entry_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $125, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB3_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB3_11: # %select.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB3_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB3_13: # %select.end
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jne .LBB3_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB3_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
@@ -748,12 +737,13 @@ define i128 @div_by_7(i128 %x) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
@@ -761,21 +751,22 @@ define i128 @div_by_7(i128 %x) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %esi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB3_19
@@ -796,122 +787,127 @@ define i128 @div_by_7(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl 92(%esp,%edi), %eax
-; X86-NEXT:    movl 88(%esp,%edi), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NEXT:    movl 84(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 92(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %edi
+; X86-NEXT:    movl 84(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $7, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB3_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $7, %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shrdl $31, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    jne .LBB3_16
 ; X86-NEXT:  .LBB3_17: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    leal (%eax,%edi,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    shldl $1, %eax, %esi
 ; X86-NEXT:  .LBB3_21: # %udiv-end
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -919,17 +915,15 @@ define i128 @div_by_7(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB3_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB3_11
 ; X86-NEXT:  .LBB3_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB3_17
 ; X86-NEXT:  .LBB3_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB3_21
 ;
 ; X64-LABEL: div_by_7:
@@ -977,95 +971,91 @@ define i128 @div_by_11(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB4_3
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB4_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB4_6
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    je .LBB4_7
+; X86-NEXT:    jmp .LBB4_8
 ; X86-NEXT:  .LBB4_4:
-; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB4_6: # %_udiv-special-cases
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB4_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB4_8: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $124, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB4_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB4_11: # %select.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB4_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB4_13: # %select.end
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB4_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
@@ -1074,12 +1064,13 @@ define i128 @div_by_11(i128 %x) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
@@ -1087,21 +1078,22 @@ define i128 @div_by_11(i128 %x) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %esi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB4_19
@@ -1122,122 +1114,127 @@ define i128 @div_by_11(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl 92(%esp,%edi), %eax
-; X86-NEXT:    movl 88(%esp,%edi), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NEXT:    movl 84(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 92(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %edi
+; X86-NEXT:    movl 84(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $11, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $11, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB4_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $11, %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $11, %eax
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shrdl $31, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    jne .LBB4_16
 ; X86-NEXT:  .LBB4_17: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    leal (%eax,%edi,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    shldl $1, %eax, %esi
 ; X86-NEXT:  .LBB4_21: # %udiv-end
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1245,17 +1242,15 @@ define i128 @div_by_11(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB4_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB4_11
 ; X86-NEXT:  .LBB4_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB4_17
 ; X86-NEXT:  .LBB4_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_21
 ;
 ; X64-LABEL: div_by_11:
@@ -1301,95 +1296,91 @@ define i128 @div_by_22(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB5_3
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB5_3: # %entry_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB5_4
 ; X86-NEXT:  # %bb.5: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB5_6
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    je .LBB5_7
+; X86-NEXT:    jmp .LBB5_8
 ; X86-NEXT:  .LBB5_4:
-; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB5_6: # %entry_udiv-special-cases
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB5_8
-; X86-NEXT:  # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT:  .LBB5_7: # %entry_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB5_8: # %entry_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $123, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB5_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB5_11: # %select.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB5_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB5_13: # %select.end
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jne .LBB5_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB5_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
@@ -1398,12 +1389,13 @@ define i128 @div_by_22(i128 %x) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
@@ -1411,21 +1403,22 @@ define i128 @div_by_22(i128 %x) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %esi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB5_19
@@ -1446,122 +1439,127 @@ define i128 @div_by_22(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl 92(%esp,%edi), %eax
-; X86-NEXT:    movl 88(%esp,%edi), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NEXT:    movl 84(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 92(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %edi
+; X86-NEXT:    movl 84(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $22, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $22, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB5_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $22, %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $22, %eax
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shrdl $31, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    jne .LBB5_16
 ; X86-NEXT:  .LBB5_17: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    leal (%eax,%edi,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    shldl $1, %eax, %esi
 ; X86-NEXT:  .LBB5_21: # %udiv-end
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1569,17 +1567,15 @@ define i128 @div_by_22(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB5_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB5_11
 ; X86-NEXT:  .LBB5_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB5_17
 ; X86-NEXT:  .LBB5_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB5_21
 ;
 ; X64-LABEL: div_by_22:
@@ -1627,95 +1623,91 @@ define i128 @div_by_56(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB6_3
 ; X86-NEXT:  .LBB6_1:
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB6_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB6_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB6_6
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    je .LBB6_7
+; X86-NEXT:    jmp .LBB6_8
 ; X86-NEXT:  .LBB6_4:
-; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB6_6: # %_udiv-special-cases
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB6_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB6_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB6_8: # %_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $122, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB6_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB6_11: # %select.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB6_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB6_13: # %select.end
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jne .LBB6_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB6_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
@@ -1724,12 +1716,13 @@ define i128 @div_by_56(i128 %x) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
@@ -1737,21 +1730,22 @@ define i128 @div_by_56(i128 %x) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %esi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB6_19
@@ -1772,122 +1766,127 @@ define i128 @div_by_56(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl 92(%esp,%edi), %eax
-; X86-NEXT:    movl 88(%esp,%edi), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NEXT:    movl 84(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 92(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %edi
+; X86-NEXT:    movl 84(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $56, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $56, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB6_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $56, %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $56, %eax
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shrdl $31, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    jne .LBB6_16
 ; X86-NEXT:  .LBB6_17: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    leal (%eax,%edi,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    shldl $1, %eax, %esi
 ; X86-NEXT:  .LBB6_21: # %udiv-end
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1895,17 +1894,15 @@ define i128 @div_by_56(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB6_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB6_11
 ; X86-NEXT:  .LBB6_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB6_17
 ; X86-NEXT:  .LBB6_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB6_21
 ;
 ; X64-LABEL: div_by_56:
@@ -1953,110 +1950,107 @@ define i128 @rem_by_7(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB7_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB7_3
 ; X86-NEXT:  .LBB7_1:
-; X86-NEXT:    bsrl %ebx, %edx
-; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB7_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB7_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB7_6
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    je .LBB7_7
+; X86-NEXT:    jmp .LBB7_8
 ; X86-NEXT:  .LBB7_4:
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB7_6: # %_udiv-special-cases
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    jne .LBB7_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB7_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB7_8: # %_udiv-special-cases
-; X86-NEXT:    movl $125, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $125, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB7_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB7_11: # %select.end
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB7_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB7_13: # %select.end
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jne .LBB7_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $127, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB7_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
@@ -2064,26 +2058,26 @@ define i128 @rem_by_7(i128 %x) nounwind {
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movl 136(%esp,%eax), %esi
-; X86-NEXT:    movl 140(%esp,%eax), %edx
-; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %edi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
 ; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shldl %cl, %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB7_19
 ; X86-NEXT:  # %bb.15: # %udiv-preheader
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
@@ -2094,146 +2088,152 @@ define i128 @rem_by_7(i128 %x) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl 88(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edi
-; X86-NEXT:    movl 84(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $7, %edx
-; X86-NEXT:    addl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NEXT:    movl 84(%esp,%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $7, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB7_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    shldl $1, %eax, %ebx
+; X86-NEXT:    shldl $1, %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl $7, %eax
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    andl $1, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $7, %edx
-; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $31, %ebx, %ecx
+; X86-NEXT:    shrdl $31, %eax, %ebx
+; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    jne .LBB7_16
 ; X86-NEXT:  .LBB7_17: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %eax
-; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    leal (%edx,%ecx,2), %ebx
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $31, %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%edi,2), %ebx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:  .LBB7_21: # %udiv-end
-; X86-NEXT:    movl $7, %ecx
-; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl $7, %edx
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $7, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl 36(%ebp), %edx
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    movl %esi, (%ebx)
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, 4(%ebx)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, 8(%ebx)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, 12(%ebx)
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2242,14 +2242,13 @@ define i128 @rem_by_7(i128 %x) nounwind {
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB7_9:
 ; X86-NEXT:    movb $1, %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB7_11
 ; X86-NEXT:  .LBB7_19:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    jmp .LBB7_17
 ; X86-NEXT:  .LBB7_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB7_21
 ;
@@ -2287,110 +2286,107 @@ define i128 @rem_by_14(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB8_1
 ; X86-NEXT:  # %bb.2: # %_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB8_3
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    bsrl %ebx, %edx
-; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB8_3: # %_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB8_4
 ; X86-NEXT:  # %bb.5: # %_udiv-special-cases
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB8_6
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    je .LBB8_7
+; X86-NEXT:    jmp .LBB8_8
 ; X86-NEXT:  .LBB8_4:
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB8_6: # %_udiv-special-cases
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    jne .LBB8_8
-; X86-NEXT:  # %bb.7: # %_udiv-special-cases
+; X86-NEXT:  .LBB8_7: # %_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB8_8: # %_udiv-special-cases
-; X86-NEXT:    movl $124, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $124, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB8_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB8_11: # %select.end
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB8_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB8_13: # %select.end
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jne .LBB8_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $127, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB8_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
@@ -2398,26 +2394,26 @@ define i128 @rem_by_14(i128 %x) nounwind {
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movl 136(%esp,%eax), %esi
-; X86-NEXT:    movl 140(%esp,%eax), %edx
-; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 140(%esp,%eax), %edi
+; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %edi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
 ; X86-NEXT:    shldl %cl, %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    shldl %cl, %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB8_19
 ; X86-NEXT:  # %bb.15: # %udiv-preheader
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
@@ -2428,146 +2424,152 @@ define i128 @rem_by_14(i128 %x) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl 88(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edi
-; X86-NEXT:    movl 84(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $14, %edx
-; X86-NEXT:    addl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NEXT:    movl 84(%esp,%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $14, %eax
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB8_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    shldl $1, %eax, %ebx
+; X86-NEXT:    shldl $1, %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl $14, %eax
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    andl $1, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $14, %edx
-; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    shrdl $31, %ebx, %ecx
+; X86-NEXT:    shrdl $31, %eax, %ebx
+; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    jne .LBB8_16
 ; X86-NEXT:  .LBB8_17: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %eax
-; X86-NEXT:    shldl $1, %edi, %edx
+; X86-NEXT:    leal (%edx,%ecx,2), %ebx
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $31, %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%edi,2), %ebx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:  .LBB8_21: # %udiv-end
-; X86-NEXT:    movl $14, %ecx
-; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl $14, %edx
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $14, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $14, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl 36(%ebp), %edx
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    movl %esi, (%ebx)
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, 4(%ebx)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, 8(%ebx)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl %eax, 12(%ebx)
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2576,14 +2578,13 @@ define i128 @rem_by_14(i128 %x) nounwind {
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB8_9:
 ; X86-NEXT:    movb $1, %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_19:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    jmp .LBB8_17
 ; X86-NEXT:  .LBB8_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB8_21
 ;
@@ -2624,95 +2625,91 @@ define i128 @div_by_67(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_1
 ; X86-NEXT:  # %bb.2: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:  .LBB9_3: # %entry_udiv-special-cases
-; X86-NEXT:    movl 24(%ebp), %eax
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB9_4
 ; X86-NEXT:  # %bb.5: # %entry_udiv-special-cases
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 24(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    jmp .LBB9_6
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    je .LBB9_7
+; X86-NEXT:    jmp .LBB9_8
 ; X86-NEXT:  .LBB9_4:
-; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:  .LBB9_6: # %entry_udiv-special-cases
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    jne .LBB9_8
-; X86-NEXT:  # %bb.7: # %entry_udiv-special-cases
+; X86-NEXT:  .LBB9_7: # %entry_udiv-special-cases
 ; X86-NEXT:    orl $64, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB9_8: # %entry_udiv-special-cases
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $121, %ebx
-; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    je .LBB9_9
 ; X86-NEXT:  # %bb.10: # %select.false.sink
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB9_11: # %select.end
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    jne .LBB9_13
 ; X86-NEXT:  # %bb.12: # %select.end
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:  .LBB9_13: # %select.end
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jne .LBB9_14
 ; X86-NEXT:  # %bb.20: # %select.end
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    xorl $127, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $127, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB9_21
 ; X86-NEXT:  # %bb.18: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %eax
@@ -2721,12 +2718,13 @@ define i128 @div_by_67(i128 %x) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
@@ -2734,21 +2732,22 @@ define i128 @div_by_67(i128 %x) nounwind {
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 136(%esp,%eax), %edi
+; X86-NEXT:    movl 140(%esp,%eax), %esi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 128(%esp,%eax), %esi
 ; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jb .LBB9_19
@@ -2769,122 +2768,127 @@ define i128 @div_by_67(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    movl 92(%esp,%edi), %eax
-; X86-NEXT:    movl 88(%esp,%edi), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NEXT:    movl 84(%esp,%edi), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 92(%esp,%edx), %eax
+; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%edx), %edi
+; X86-NEXT:    movl 84(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $67, %edx
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $67, %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB9_16: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    shldl $1, %edx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $67, %ecx
+; X86-NEXT:    andl %ecx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl $67, %eax
-; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ebx
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
+; X86-NEXT:    shrdl $31, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    jne .LBB9_16
 ; X86-NEXT:  .LBB9_17: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    leal (%eax,%edi,2), %ebx
+; X86-NEXT:    shrdl $31, %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $31, %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    leal (%esi,%eax,2), %edi
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    shldl $1, %eax, %esi
 ; X86-NEXT:  .LBB9_21: # %udiv-end
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2892,17 +2896,15 @@ define i128 @div_by_67(i128 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ; X86-NEXT:  .LBB9_9:
-; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
 ; X86-NEXT:    jmp .LBB9_11
 ; X86-NEXT:  .LBB9_19:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB9_17
 ; X86-NEXT:  .LBB9_14:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB9_21
 ;
 ; X64-LABEL: div_by_67:
diff --git a/llvm/test/CodeGen/X86/i386-baseptr.ll b/llvm/test/CodeGen/X86/i386-baseptr.ll
index 777eb838b84cc..c3d72103945bb 100644
--- a/llvm/test/CodeGen/X86/i386-baseptr.ll
+++ b/llvm/test/CodeGen/X86/i386-baseptr.ll
@@ -59,7 +59,6 @@ define void @clobber_base() #0 {
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    subl %eax, %edx
 ; CHECK-NEXT:    movl %edx, %esp
-; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    movl $405, %esi # imm = 0x195
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
@@ -68,6 +67,7 @@ define void @clobber_base() #0 {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    movl %edx, -120(%ebp)
 ; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    movl $0, (%ecx,%eax)
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-NEXT:    leal -4(%ebp), %esp
@@ -100,7 +100,6 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; CHECK-NEXT:    subl $128, %esp
 ; CHECK-NEXT:    movl %esp, %esi
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    movl 8(%ebp), %edi
 ; CHECK-NEXT:    calll helper at PLT
 ; CHECK-NEXT:    movl %esp, %ecx
 ; CHECK-NEXT:    leal 31(,%eax,4), %eax
@@ -108,7 +107,6 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    subl %eax, %edx
 ; CHECK-NEXT:    movl %edx, %esp
-; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    movl $405, %esi # imm = 0x195
@@ -125,7 +123,9 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    movl %edx, (%esi)
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    movl %edi, (%ecx,%eax)
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    movl 8(%ebp), %edx
+; CHECK-NEXT:    movl %edx, (%ecx,%eax)
 ; CHECK-NEXT:    leal -4(%ebp), %esp
 ; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
index 4347d626e5838..b750fc1b87efd 100644
--- a/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -38,8 +38,8 @@ define i32 @eflagsLiveInPrologue() #0 {
 ; ENABLE-NEXT:    pushl %esi
 ; ENABLE-NEXT:    subl $8, %esp
 ; ENABLE-NEXT:    xorl %edx, %edx
-; ENABLE-NEXT:    cmpb $0, _d
 ; ENABLE-NEXT:    movl $6, %ecx
+; ENABLE-NEXT:    cmpb $0, _d
 ; ENABLE-NEXT:    cmovnel %edx, %ecx
 ; ENABLE-NEXT:    movl L_e$non_lazy_ptr, %edx
 ; ENABLE-NEXT:    movb %cl, (%edx)
@@ -76,8 +76,8 @@ define i32 @eflagsLiveInPrologue() #0 {
 ; DISABLE-NEXT:    jmp LBB0_3
 ; DISABLE-NEXT:  LBB0_4: ## %for.end
 ; DISABLE-NEXT:    xorl %edx, %edx
-; DISABLE-NEXT:    cmpb $0, _d
 ; DISABLE-NEXT:    movl $6, %ecx
+; DISABLE-NEXT:    cmpb $0, _d
 ; DISABLE-NEXT:    cmovnel %edx, %ecx
 ; DISABLE-NEXT:    movl L_e$non_lazy_ptr, %edx
 ; DISABLE-NEXT:    movb %cl, (%edx)
diff --git a/llvm/test/CodeGen/X86/i64-mem-copy.ll b/llvm/test/CodeGen/X86/i64-mem-copy.ll
index 4cdb079d43993..bdc802eb5c419 100644
--- a/llvm/test/CodeGen/X86/i64-mem-copy.ll
+++ b/llvm/test/CodeGen/X86/i64-mem-copy.ll
@@ -16,16 +16,16 @@ define void @foo(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X86AVX-LABEL: foo:
 ; X86AVX:       # %bb.0:
 ; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vmovsd %xmm0, (%eax)
 ; X86AVX-NEXT:    retl
   %tmp1 = load i64, ptr %y, align 8
@@ -45,15 +45,15 @@ define void @store_i64_from_vector(<8 x i16> %x, <8 x i16> %y, ptr %i) nounwind
 ;
 ; X86-LABEL: store_i64_from_vector:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    paddw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X86AVX-LABEL: store_i64_from_vector:
 ; X86AVX:       # %bb.0:
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vmovq %xmm0, (%eax)
 ; X86AVX-NEXT:    retl
   %z = add <8 x i16> %x, %y                          ; force execution domain
@@ -76,8 +76,8 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, ptr %i) noun
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    paddw 8(%ebp), %xmm1
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -85,10 +85,10 @@ define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, ptr %i) noun
 ;
 ; X86AVX-LABEL: store_i64_from_vector256:
 ; X86AVX:       # %bb.0:
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; X86AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; X86AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vmovq %xmm0, (%eax)
 ; X86AVX-NEXT:    vzeroupper
 ; X86AVX-NEXT:    retl
@@ -127,17 +127,17 @@ define void @PR23476(<5 x i64> %in, ptr %out, i32 %index) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
-; X86-NEXT:    movl 52(%ebp), %eax
-; X86-NEXT:    andl $7, %eax
-; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movups 24(%ebp), %xmm0
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movups 8(%ebp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movups 8(%ebp), %xmm1
-; X86-NEXT:    movups 24(%ebp), %xmm2
-; X86-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT:    movaps %xmm1, (%esp)
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    andl $7, %eax
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movsd %xmm0, (%ecx)
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -148,15 +148,15 @@ define void @PR23476(<5 x i64> %in, ptr %out, i32 %index) nounwind {
 ; X86AVX-NEXT:    movl %esp, %ebp
 ; X86AVX-NEXT:    andl $-32, %esp
 ; X86AVX-NEXT:    subl $96, %esp
+; X86AVX-NEXT:    vmovups 8(%ebp), %ymm0
+; X86AVX-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86AVX-NEXT:    movl 52(%ebp), %eax
 ; X86AVX-NEXT:    andl $7, %eax
-; X86AVX-NEXT:    movl 48(%ebp), %ecx
-; X86AVX-NEXT:    vmovups 8(%ebp), %ymm1
-; X86AVX-NEXT:    vmovaps %ymm1, (%esp)
-; X86AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
 ; X86AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86AVX-NEXT:    vmovsd %xmm0, (%ecx)
+; X86AVX-NEXT:    movl 48(%ebp), %eax
+; X86AVX-NEXT:    vmovsd %xmm0, (%eax)
 ; X86AVX-NEXT:    movl %ebp, %esp
 ; X86AVX-NEXT:    popl %ebp
 ; X86AVX-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll b/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll
index 5cb1b8f4a89e4..e7ed42a2e41c6 100644
--- a/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/i686-win-shrink-wrapping.ll
@@ -12,8 +12,8 @@ define x86_thiscallcc void @stackRealignment(ptr %this) {
 ; SHRINK-WRAP-LABEL: stackRealignment:
 ; SHRINK-WRAP:       # %bb.0: # %entry
 ; SHRINK-WRAP-NEXT:    movl (%ecx), %eax
-; SHRINK-WRAP-NEXT:    cmpl $33, %eax
 ; SHRINK-WRAP-NEXT:    movl $42, %edx
+; SHRINK-WRAP-NEXT:    cmpl $33, %eax
 ; SHRINK-WRAP-NEXT:    jge LBB0_2
 ; SHRINK-WRAP-NEXT:  # %bb.1: # %entry
 ; SHRINK-WRAP-NEXT:    movl $128, %edx
@@ -43,8 +43,8 @@ define x86_thiscallcc void @stackRealignment(ptr %this) {
 ; NO-SHRINK-WRAP-NEXT:    andl $-8, %esp
 ; NO-SHRINK-WRAP-NEXT:    subl $16, %esp
 ; NO-SHRINK-WRAP-NEXT:    movl (%ecx), %eax
-; NO-SHRINK-WRAP-NEXT:    cmpl $33, %eax
 ; NO-SHRINK-WRAP-NEXT:    movl $42, %edx
+; NO-SHRINK-WRAP-NEXT:    cmpl $33, %eax
 ; NO-SHRINK-WRAP-NEXT:    jge LBB0_2
 ; NO-SHRINK-WRAP-NEXT:  # %bb.1: # %entry
 ; NO-SHRINK-WRAP-NEXT:    movl $128, %edx
diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll
index 3c3171032451a..1a161d4bba2ab 100644
--- a/llvm/test/CodeGen/X86/iabs.ll
+++ b/llvm/test/CodeGen/X86/iabs.ll
@@ -125,31 +125,34 @@ define i128 @test_i128(i128 %a) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 36(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll
index 71893a9e4be67..17406da5f2e7f 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll
@@ -14,29 +14,32 @@ define i64 @eq_or_with_dom_abs(i64 %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    subl %eax, %esi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    xorl $64, %esi
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    sete %bl
+; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
 ; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    movl $2344, %edi # imm = 0x928
 ; X86-NEXT:    cmpl %eax, %edi
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    jb .LBB0_2
 ; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:  .LBB0_2:
+; X86-NEXT:    jb .LBB0_4
+; X86-NEXT:  # %bb.3:
 ; X86-NEXT:    movb %bl, %cl
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:  .LBB0_2:
+; X86-NEXT:  .LBB0_4:
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -69,16 +72,15 @@ define i64 @eq_or_with_dom_abs(i64 %x) nounwind {
 define i32 @eq_or_with_dom_abs_non_po2(i32 %x) nounwind {
 ; X86-LABEL: eq_or_with_dom_abs_non_po2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl $123, %edx
+; X86-NEXT:    cmpl $123, %eax
 ; X86-NEXT:    sete %dl
+; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
 ; X86-NEXT:    cmpl $2345, %eax # imm = 0x929
 ; X86-NEXT:    jae .LBB1_2
 ; X86-NEXT:  # %bb.1:
@@ -114,15 +116,14 @@ define i32 @eq_or_with_dom_abs_non_po2(i32 %x) nounwind {
 define i8 @ne_and_with_dom_abs_non_pow2(i8 %x) nounwind {
 ; X86-LABEL: ne_and_with_dom_abs_non_pow2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarb $7, %al
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    xorb $12, %al
-; X86-NEXT:    cmpb $121, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarb $7, %cl
+; X86-NEXT:    xorb %cl, %al
+; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    cmpb $121, %al
 ; X86-NEXT:    setne %cl
+; X86-NEXT:    xorb $12, %al
 ; X86-NEXT:    cmpb $24, %al
 ; X86-NEXT:    jae .LBB2_2
 ; X86-NEXT:  # %bb.1:
@@ -166,12 +167,11 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind {
 ; X86-NEXT:    sarl $15, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
-; X86-NEXT:    movzwl %ax, %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpw $64, %dx
+; X86-NEXT:    cmpw $64, %ax
 ; X86-NEXT:    setne %dl
+; X86-NEXT:    xorl $12312, %eax # imm = 0x3018
+; X86-NEXT:    movzwl %ax, %esi
 ; X86-NEXT:    cmpl $2345, %esi # imm = 0x929
 ; X86-NEXT:    jae .LBB3_2
 ; X86-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
index b14fedeaae57d..4f0885075ac3a 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll
@@ -54,9 +54,9 @@ define i1 @eq_pow_mismatch_or(i32 %0) nounwind {
 ; X86-LABEL: eq_pow_mismatch_or:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $16, %eax
-; X86-NEXT:    sete %cl
 ; X86-NEXT:    cmpl $-32, %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    cmpl $16, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    retl
@@ -79,9 +79,9 @@ define i1 @ne_non_pow_and(i8 %0) nounwind {
 ; X86-LABEL: ne_non_pow_and:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $17, %al
-; X86-NEXT:    setne %cl
 ; X86-NEXT:    cmpb $-17, %al
+; X86-NEXT:    setne %cl
+; X86-NEXT:    cmpb $17, %al
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    retl
@@ -105,8 +105,8 @@ define i1 @ne_pow_or(i32 %0) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $32, %ecx
-; X86-NEXT:    xorl $-32, %eax
+; X86-NEXT:    xorl $-32, %ecx
+; X86-NEXT:    xorl $32, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -130,8 +130,8 @@ define i1 @eq_pow_and(i8 %0) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorb $16, %cl
-; X86-NEXT:    xorb $-16, %al
+; X86-NEXT:    xorb $-16, %cl
+; X86-NEXT:    xorb $16, %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -223,32 +223,30 @@ define i1 @abs_ne_nonpow2(i16 %0) nounwind {
 define <2 x i1> @abs_ne_vec(<2 x i64> %0) nounwind {
 ; X86-LABEL: abs_ne_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    xorl $8, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    setne %al
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    xorl $8, %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    setne %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    xorl $8, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: abs_ne_vec:
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 0296c2a011e25..76175ae6acbcf 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -141,12 +141,12 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 20(%ebp), %ecx
-; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl 16(%ebp), %eax
-; X86-NEXT:    orl 12(%ebp), %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    shll $17, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    orl 16(%ebp), %ecx
+; X86-NEXT:    orl 12(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -170,12 +170,12 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 20(%ebp), %ecx
-; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl 16(%ebp), %eax
-; X86-NEXT:    orl 12(%ebp), %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    shll $17, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    orl 16(%ebp), %ecx
+; X86-NEXT:    orl 12(%ebp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -203,24 +203,24 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    shldl $17, %edx, %esi
-; X86-NEXT:    shldl $17, %ecx, %edx
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
 ; X86-NEXT:    shldl $17, %eax, %ecx
-; X86-NEXT:    shll $17, %eax
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    shldl $17, %esi, %eax
+; X86-NEXT:    shldl $17, %edx, %esi
+; X86-NEXT:    shll $17, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %eax, %ebx
 ; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    sete %bl
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edx
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %ebx, %eax
@@ -256,9 +256,9 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $17, %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
@@ -285,16 +285,16 @@ define i1 @opt_setcc_expanded_shl_wrong_shifts(i64 %a, i64 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shldl $17, %esi, %ecx
 ; X86-NEXT:    shldl $17, %edx, %esi
-; X86-NEXT:    shldl $17, %ecx, %edx
-; X86-NEXT:    shldl $18, %eax, %ecx
+; X86-NEXT:    shldl $18, %eax, %edx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    shll $18, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 338163d0e1cff..6da5470a5b719 100644
--- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -52,21 +52,16 @@ define void @i24_and_or(ptr %a) {
 define void @i24_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X86-LABEL: i24_insert_bit:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzbl 2(%eax), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    andl $16769023, %edx # imm = 0xFFDFFF
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzbl 2(%eax), %esi
-; X86-NEXT:    shll $16, %esi
-; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    shll $13, %ecx
-; X86-NEXT:    andl $16769023, %esi # imm = 0xFFDFFF
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    movw %si, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: i24_insert_bit:
@@ -143,10 +138,10 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; X86-LABEL: i56_insert_bit:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $13, %ecx
-; X86-NEXT:    movl $-8193, %edx # imm = 0xDFFF
-; X86-NEXT:    andl (%eax), %edx
+; X86-NEXT:    movl $-8193, %ecx # imm = 0xDFFF
+; X86-NEXT:    andl (%eax), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $13, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll
index 348f3a3be38ae..a9f228f01b4a5 100644
--- a/llvm/test/CodeGen/X86/immediate_merging.ll
+++ b/llvm/test/CodeGen/X86/immediate_merging.ll
@@ -18,18 +18,18 @@ define dso_local i32 @foo() optsize {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl $1234, %eax # imm = 0x4D2
-; X86-NEXT:    movl %eax, a
 ; X86-NEXT:    movl %eax, b
+; X86-NEXT:    movl %eax, a
 ; X86-NEXT:    movl $12, c
 ; X86-NEXT:    cmpl $12, e
 ; X86-NEXT:    jne .LBB0_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl $1, x
 ; X86-NEXT:  .LBB0_2: # %if.end
-; X86-NEXT:    movl $1234, f # imm = 0x4D2
 ; X86-NEXT:    movl $555, %eax # imm = 0x22B
 ; X86-NEXT:    movl %eax, h
 ; X86-NEXT:    addl %eax, i
+; X86-NEXT:    movl $1234, f # imm = 0x4D2
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
@@ -79,18 +79,18 @@ define dso_local i32 @foo_pgso() !prof !14 {
 ; X86-LABEL: foo_pgso:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl $1234, %eax # imm = 0x4D2
-; X86-NEXT:    movl %eax, a
 ; X86-NEXT:    movl %eax, b
+; X86-NEXT:    movl %eax, a
 ; X86-NEXT:    movl $12, c
 ; X86-NEXT:    cmpl $12, e
 ; X86-NEXT:    jne .LBB1_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    movl $1, x
 ; X86-NEXT:  .LBB1_2: # %if.end
-; X86-NEXT:    movl $1234, f # imm = 0x4D2
 ; X86-NEXT:    movl $555, %eax # imm = 0x22B
 ; X86-NEXT:    movl %eax, h
 ; X86-NEXT:    addl %eax, i
+; X86-NEXT:    movl $1234, f # imm = 0x4D2
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
@@ -137,8 +137,8 @@ if.end:                                           ; preds = %if.then, %entry
 define dso_local i32 @foo2() {
 ; X86-LABEL: foo2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl $1234, a # imm = 0x4D2
 ; X86-NEXT:    movl $1234, b # imm = 0x4D2
+; X86-NEXT:    movl $1234, a # imm = 0x4D2
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/inalloca-stdcall.ll b/llvm/test/CodeGen/X86/inalloca-stdcall.ll
index d18976ee2ddf5..fd10fc3dde477 100644
--- a/llvm/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/llvm/test/CodeGen/X86/inalloca-stdcall.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
 
 %Foo = type { i32, i32 }
@@ -6,21 +7,26 @@ declare x86_stdcallcc void @f(ptr inalloca(%Foo) %a)
 declare x86_stdcallcc void @i(i32 %a)
 
 define void @g() {
-; CHECK-LABEL: _g:
+; CHECK-LABEL: g:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl $42, 4(%eax)
+; CHECK-NEXT:    movl $13, (%eax)
+; CHECK-NEXT:    calll _f@8
+; CHECK-NEXT:    pushl $0
+; CHECK-NEXT:    calll _i@4
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
   %b = alloca inalloca %Foo
-; CHECK: pushl   %eax
-; CHECK: pushl   %eax
   %f2 = getelementptr %Foo, ptr %b, i32 0, i32 1
   store i32 13, ptr %b
   store i32 42, ptr %f2
-; CHECK: movl %esp, %eax
-; CHECK: movl    $13, (%eax)
-; CHECK: movl    $42, 4(%eax)
   call x86_stdcallcc void @f(ptr inalloca(%Foo) %b)
-; CHECK: calll   _f@8
-; CHECK-NOT: %esp
-; CHECK: pushl
-; CHECK: calll   _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/inalloca.ll b/llvm/test/CodeGen/X86/inalloca.ll
index fafc7d97eb5c0..4cb61b773f27f 100644
--- a/llvm/test/CodeGen/X86/inalloca.ll
+++ b/llvm/test/CodeGen/X86/inalloca.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
 
 %Foo = type { i32, i32 }
@@ -5,58 +6,76 @@
 declare void @f(ptr inalloca(%Foo) %b)
 
 define void @a() {
-; CHECK-LABEL: _a:
+; CHECK-LABEL: a:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl $42, 4(%eax)
+; CHECK-NEXT:    movl $13, (%eax)
+; CHECK-NEXT:    calll _f
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %b = alloca inalloca %Foo
-; CHECK: pushl   %eax
-; CHECK: pushl   %eax
   %f2 = getelementptr %Foo, ptr %b, i32 0, i32 1
   store i32 13, ptr %b
   store i32 42, ptr %f2
-; CHECK: movl %esp, %eax
-; CHECK: movl    $13, (%eax)
-; CHECK: movl    $42, 4(%eax)
   call void @f(ptr inalloca(%Foo) %b)
-; CHECK: calll   _f
   ret void
 }
 
 declare void @inreg_with_inalloca(i32 inreg %a, ptr inalloca(%Foo) %b)
 
 define void @b() {
-; CHECK-LABEL: _b:
+; CHECK-LABEL: b:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl $42, 4(%eax)
+; CHECK-NEXT:    movl $13, (%eax)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    calll _inreg_with_inalloca
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %b = alloca inalloca %Foo
-; CHECK: pushl   %eax
-; CHECK: pushl   %eax
   %f2 = getelementptr %Foo, ptr %b, i32 0, i32 1
   store i32 13, ptr %b
   store i32 42, ptr %f2
-; CHECK: movl %esp, %eax
-; CHECK: movl    $13, (%eax)
-; CHECK: movl    $42, 4(%eax)
   call void @inreg_with_inalloca(i32 inreg 1, ptr inalloca(%Foo) %b)
-; CHECK: movl    $1, %eax
-; CHECK: calll   _inreg_with_inalloca
   ret void
 }
 
 declare x86_thiscallcc void @thiscall_with_inalloca(ptr %a, ptr inalloca(%Foo) %b)
 
 define void @c() {
-; CHECK-LABEL: _c:
+; CHECK-LABEL: c:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl %esp, %eax
+; CHECK-NEXT:    movl $42, 4(%eax)
+; CHECK-NEXT:    movl $13, (%eax)
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    calll _thiscall_with_inalloca
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %b = alloca inalloca %Foo
-; CHECK: pushl   %eax
-; CHECK: pushl   %eax
   %f2 = getelementptr %Foo, ptr %b, i32 0, i32 1
   store i32 13, ptr %b
   store i32 42, ptr %f2
-; CHECK: movl %esp, %eax
-; CHECK-DAG: movl    $13, (%eax)
-; CHECK-DAG: movl    $42, 4(%eax)
   call x86_thiscallcc void @thiscall_with_inalloca(ptr null, ptr inalloca(%Foo) %b)
-; CHECK-DAG: xorl    %ecx, %ecx
-; CHECK: calll   _thiscall_with_inalloca
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll
index af188ef3a2cf8..2794ffad36487 100644
--- a/llvm/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -70,19 +70,19 @@ define void @test6(double %A, double %B, double %C, double %D, double %E) nounwi
 ; CHECK-LABEL: test6:
 ; CHECK:       ## %bb.0: ## %entry
 ; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    ## InlineAsm Start
 ; CHECK-NEXT:    foo %st %st
 ; CHECK-NEXT:    ## InlineAsm End
 ; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    ## InlineAsm Start
 ; CHECK-NEXT:    bar %st(1) %st
 ; CHECK-NEXT:    ## InlineAsm End
 ; CHECK-NEXT:    fstp %st(1)
 ; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    ## InlineAsm Start
 ; CHECK-NEXT:    baz %st(1) %st
 ; CHECK-NEXT:    ## InlineAsm End
@@ -461,8 +461,8 @@ define void @test_live_st(i32 %a1) nounwind {
 ; CHECK-NEXT:    fldcw (%esp)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fildl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl L_fpu$non_lazy_ptr, %eax
+; CHECK-NEXT:    fildl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fstpt 128(%eax)
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/insert.ll b/llvm/test/CodeGen/X86/insert.ll
index 381de2ecaa164..12a53ad4d6fda 100644
--- a/llvm/test/CodeGen/X86/insert.ll
+++ b/llvm/test/CodeGen/X86/insert.ll
@@ -6,9 +6,10 @@ define i64 @sub8(i64 noundef %res, ptr %byte) {
 ; X86-LABEL: sub8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb (%ecx), %al
+; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub8:
@@ -27,12 +28,12 @@ entry:
 define i64 @sub16(i64 noundef %res, ptr %byte) {
 ; X86-LABEL: sub16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub16:
@@ -52,8 +53,9 @@ define i32 @sub8_32(i32 noundef %res, ptr %byte) {
 ; X86-LABEL: sub8_32:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb (%ecx), %al
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub8_32:
@@ -72,10 +74,10 @@ entry:
 define i32 @sub16_32(i32 noundef %res, ptr %byte) {
 ; X86-LABEL: sub16_32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
index 435ea61412b73..5da04ea382830 100644
--- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
@@ -8,12 +8,12 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
 ; SSE-32-LABEL: PR15298:
 ; SSE-32:       # %bb.0: # %L.entry
 ; SSE-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-32-NEXT:    xorps %xmm0, %xmm0
 ; SSE-32-NEXT:    xorps %xmm1, %xmm1
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0]
-; SSE-32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    movups %xmm0, 624(%eax)
+; SSE-32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
 ; SSE-32-NEXT:    movups %xmm1, 608(%eax)
 ; SSE-32-NEXT:    retl
 ;
@@ -30,10 +30,10 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
 ; AVX-32-LABEL: PR15298:
 ; AVX-32:       # %bb.0: # %L.entry
 ; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT:    vbroadcastss 304(%ecx), %xmm0
+; AVX-32-NEXT:    vbroadcastss 304(%eax), %xmm0
 ; AVX-32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-32-NEXT:    vmovups %ymm0, 608(%eax)
 ; AVX-32-NEXT:    vzeroupper
 ; AVX-32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/insertelement-legalize.ll b/llvm/test/CodeGen/X86/insertelement-legalize.ll
index 67f824ff8412d..febce95ce83b8 100644
--- a/llvm/test/CodeGen/X86/insertelement-legalize.ll
+++ b/llvm/test/CodeGen/X86/insertelement-legalize.ll
@@ -5,23 +5,21 @@
 define void @test(<2 x i64> %val, ptr %dst, i64 %x) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    adcl %edx, %edx
 ; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    movl %esi, (%eax)
+; CHECK-NEXT:    adcl %edx, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, (%eax)
 ; CHECK-NEXT:    movl %edx, 12(%eax)
-; CHECK-NEXT:    movl %edi, 4(%eax)
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl %esi, 4(%eax)
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
 entry:
 	%tmp4 = insertelement <2 x i64> %val, i64 %x, i32 0		; <<2 x i64>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index d151c6f28e51b..77bf11244283c 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -863,10 +863,10 @@ define <16 x i8> @arg_i8_v16i8(<16 x i8> %v, i8 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
+; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $15, %eax
 ; X86AVX2-NEXT:    movzbl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -917,10 +917,10 @@ define <8 x i16> @arg_i16_v8i16(<8 x i16> %v, i16 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
+; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $7, %eax
 ; X86AVX2-NEXT:    movzwl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -962,10 +962,10 @@ define <4 x i32> @arg_i32_v4i32(<4 x i32> %v, i32 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
+; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $3, %eax
 ; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -1009,19 +1009,19 @@ define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    pushl %esi
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $48, %esp
-; X86AVX2-NEXT:    movl 8(%ebp), %edx
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    movl 16(%ebp), %ecx
 ; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    addl %ecx, %ecx
-; X86AVX2-NEXT:    movl %ecx, %esi
-; X86AVX2-NEXT:    andl $3, %esi
-; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT:    movl 16(%ebp), %eax
+; X86AVX2-NEXT:    addl %eax, %eax
+; X86AVX2-NEXT:    movl %eax, %ecx
+; X86AVX2-NEXT:    andl $3, %ecx
+; X86AVX2-NEXT:    movl 8(%ebp), %edx
+; X86AVX2-NEXT:    movl 12(%ebp), %esi
+; X86AVX2-NEXT:    movl %edx, (%esp,%ecx,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT:    incl %ecx
-; X86AVX2-NEXT:    andl $3, %ecx
-; X86AVX2-NEXT:    movl %eax, 16(%esp,%ecx,4)
+; X86AVX2-NEXT:    incl %eax
+; X86AVX2-NEXT:    andl $3, %eax
+; X86AVX2-NEXT:    movl %esi, 16(%esp,%eax,4)
 ; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
 ; X86AVX2-NEXT:    leal -4(%ebp), %esp
 ; X86AVX2-NEXT:    popl %esi
@@ -1142,11 +1142,11 @@ define <2 x double> @arg_f64_v2f64(<2 x double> %v, double %x, i32 %y) nounwind
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
+; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
 ; X86AVX2-NEXT:    movl 16(%ebp), %eax
 ; X86AVX2-NEXT:    andl $1, %eax
-; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
+; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX2-NEXT:    vmovsd %xmm0, (%esp,%eax,8)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1199,12 +1199,12 @@ define <16 x i8> @load_i8_v16i8(<16 x i8> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $15, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movzbl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movzbl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $15, %ecx
+; X86AVX2-NEXT:    movb %al, (%esp,%ecx)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1258,12 +1258,12 @@ define <8 x i16> @load_i16_v8i16(<8 x i16> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $7, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movzwl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movzwl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $7, %ecx
+; X86AVX2-NEXT:    movw %ax, (%esp,%ecx,2)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1307,12 +1307,12 @@ define <4 x i32> @load_i32_v4i32(<4 x i32> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $3, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $3, %ecx
+; X86AVX2-NEXT:    movl %eax, (%esp,%ecx,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1358,20 +1358,20 @@ define <2 x i64> @load_i64_v2i64(<2 x i64> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    pushl %esi
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $48, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movl (%ecx), %edx
-; X86AVX2-NEXT:    movl 4(%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    addl %eax, %eax
-; X86AVX2-NEXT:    movl %eax, %esi
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movl (%eax), %ecx
+; X86AVX2-NEXT:    movl 4(%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %edx
+; X86AVX2-NEXT:    addl %edx, %edx
+; X86AVX2-NEXT:    movl %edx, %esi
 ; X86AVX2-NEXT:    andl $3, %esi
-; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT:    movl %ecx, (%esp,%esi,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT:    incl %eax
-; X86AVX2-NEXT:    andl $3, %eax
-; X86AVX2-NEXT:    movl %ecx, 16(%esp,%eax,4)
+; X86AVX2-NEXT:    incl %edx
+; X86AVX2-NEXT:    andl $3, %edx
+; X86AVX2-NEXT:    movl %eax, 16(%esp,%edx,4)
 ; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
 ; X86AVX2-NEXT:    leal -4(%ebp), %esp
 ; X86AVX2-NEXT:    popl %esi
@@ -1432,9 +1432,9 @@ define <4 x float> @load_f32_v4f32(<4 x float> %v, ptr %p, i32 %y) nounwind {
 ;
 ; X86AVX2-LABEL: load_f32_v4f32:
 ; X86AVX2:       # %bb.0:
-; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
 ; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX2-NEXT:    vbroadcastss (%eax), %xmm2
 ; X86AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
 ; X86AVX2-NEXT:    retl
@@ -1498,12 +1498,12 @@ define <2 x double> @load_f64_v2f64(<2 x double> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-16, %esp
 ; X86AVX2-NEXT:    subl $32, %esp
+; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $1, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
-; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
+; X86AVX2-NEXT:    vmovsd %xmm0, (%esp,%eax,8)
 ; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1568,10 +1568,10 @@ define <32 x i8> @arg_i8_v32i8(<32 x i8> %v, i8 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
+; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $31, %eax
 ; X86AVX2-NEXT:    movzbl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -1636,10 +1636,10 @@ define <16 x i16> @arg_i16_v16i16(<16 x i16> %v, i16 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
+; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $15, %eax
 ; X86AVX2-NEXT:    movzwl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -1689,10 +1689,10 @@ define <8 x i32> @arg_i32_v8i32(<8 x i32> %v, i32 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
+; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $7, %eax
 ; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
@@ -1744,19 +1744,19 @@ define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {
 ; X86AVX2-NEXT:    pushl %esi
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $96, %esp
-; X86AVX2-NEXT:    movl 8(%ebp), %edx
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    movl 16(%ebp), %ecx
 ; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    addl %ecx, %ecx
-; X86AVX2-NEXT:    movl %ecx, %esi
-; X86AVX2-NEXT:    andl $7, %esi
-; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT:    movl 16(%ebp), %eax
+; X86AVX2-NEXT:    addl %eax, %eax
+; X86AVX2-NEXT:    movl %eax, %ecx
+; X86AVX2-NEXT:    andl $7, %ecx
+; X86AVX2-NEXT:    movl 8(%ebp), %edx
+; X86AVX2-NEXT:    movl 12(%ebp), %esi
+; X86AVX2-NEXT:    movl %edx, (%esp,%ecx,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT:    incl %ecx
-; X86AVX2-NEXT:    andl $7, %ecx
-; X86AVX2-NEXT:    movl %eax, 32(%esp,%ecx,4)
+; X86AVX2-NEXT:    incl %eax
+; X86AVX2-NEXT:    andl $7, %eax
+; X86AVX2-NEXT:    movl %esi, 32(%esp,%eax,4)
 ; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %ymm0
 ; X86AVX2-NEXT:    leal -4(%ebp), %esp
 ; X86AVX2-NEXT:    popl %esi
@@ -1866,11 +1866,11 @@ define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
+; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
 ; X86AVX2-NEXT:    movl 16(%ebp), %eax
 ; X86AVX2-NEXT:    andl $3, %eax
-; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
+; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX2-NEXT:    vmovsd %xmm0, (%esp,%eax,8)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -1937,12 +1937,12 @@ define <32 x i8> @load_i8_v32i8(<32 x i8> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $31, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movzbl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movzbl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $31, %ecx
+; X86AVX2-NEXT:    movb %al, (%esp,%ecx)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -2010,12 +2010,12 @@ define <16 x i16> @load_i16_v16i16(<16 x i16> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $15, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movzwl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movzwl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $15, %ecx
+; X86AVX2-NEXT:    movw %ax, (%esp,%ecx,2)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -2067,12 +2067,12 @@ define <8 x i32> @load_i32_v8i32(<8 x i32> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    andl $7, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movl (%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movl (%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %ecx
+; X86AVX2-NEXT:    andl $7, %ecx
+; X86AVX2-NEXT:    movl %eax, (%esp,%ecx,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -2126,20 +2126,20 @@ define <4 x i64> @load_i64_v4i64(<4 x i64> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    pushl %esi
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $96, %esp
-; X86AVX2-NEXT:    movl 12(%ebp), %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    movl (%ecx), %edx
-; X86AVX2-NEXT:    movl 4(%ecx), %ecx
 ; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    addl %eax, %eax
-; X86AVX2-NEXT:    movl %eax, %esi
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    movl (%eax), %ecx
+; X86AVX2-NEXT:    movl 4(%eax), %eax
+; X86AVX2-NEXT:    movl 12(%ebp), %edx
+; X86AVX2-NEXT:    addl %edx, %edx
+; X86AVX2-NEXT:    movl %edx, %esi
 ; X86AVX2-NEXT:    andl $7, %esi
-; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
+; X86AVX2-NEXT:    movl %ecx, (%esp,%esi,4)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X86AVX2-NEXT:    incl %eax
-; X86AVX2-NEXT:    andl $7, %eax
-; X86AVX2-NEXT:    movl %ecx, 32(%esp,%eax,4)
+; X86AVX2-NEXT:    incl %edx
+; X86AVX2-NEXT:    andl $7, %edx
+; X86AVX2-NEXT:    movl %eax, 32(%esp,%edx,4)
 ; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %ymm0
 ; X86AVX2-NEXT:    leal -4(%ebp), %esp
 ; X86AVX2-NEXT:    popl %esi
@@ -2192,9 +2192,9 @@ define <8 x float> @load_f32_v8f32(<8 x float> %v, ptr %p, i32 %y) nounwind {
 ;
 ; X86AVX2-LABEL: load_f32_v8f32:
 ; X86AVX2:       # %bb.0:
-; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm1
 ; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
+; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX2-NEXT:    vbroadcastss (%eax), %ymm2
 ; X86AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
 ; X86AVX2-NEXT:    retl
@@ -2252,12 +2252,12 @@ define <4 x double> @load_f64_v4f64(<4 x double> %v, ptr %p, i32 %y) nounwind {
 ; X86AVX2-NEXT:    movl %esp, %ebp
 ; X86AVX2-NEXT:    andl $-32, %esp
 ; X86AVX2-NEXT:    subl $64, %esp
+; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
+; X86AVX2-NEXT:    movl 8(%ebp), %eax
+; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86AVX2-NEXT:    movl 12(%ebp), %eax
 ; X86AVX2-NEXT:    andl $3, %eax
-; X86AVX2-NEXT:    movl 8(%ebp), %ecx
-; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
+; X86AVX2-NEXT:    vmovsd %xmm0, (%esp,%eax,8)
 ; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
 ; X86AVX2-NEXT:    movl %ebp, %esp
 ; X86AVX2-NEXT:    popl %ebp
@@ -2357,13 +2357,13 @@ define i32 @PR44139(ptr %p) {
 ; X86AVX2:       # %bb.0:
 ; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86AVX2-NEXT:    vbroadcastsd (%ecx), %ymm0
-; X86AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; X86AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; X86AVX2-NEXT:    vmovaps %ymm0, 64(%ecx)
 ; X86AVX2-NEXT:    vmovaps %ymm0, 96(%ecx)
 ; X86AVX2-NEXT:    vmovaps %ymm0, 32(%ecx)
+; X86AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
+; X86AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; X86AVX2-NEXT:    movl (%ecx), %eax
-; X86AVX2-NEXT:    vmovaps %ymm1, (%ecx)
+; X86AVX2-NEXT:    vmovaps %ymm0, (%ecx)
 ; X86AVX2-NEXT:    leal 2147483647(%eax), %ecx
 ; X86AVX2-NEXT:    testl %eax, %eax
 ; X86AVX2-NEXT:    cmovnsl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
index 93b60c27255f3..9445d617cdd5f 100644
--- a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
+++ b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X32
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=X64
 
@@ -9,9 +10,9 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:    movaps (%eax), %xmm0
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    movaps (%ecx), %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-NEXT:    addps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
index cdde03fb0534b..cef425298474f 100644
--- a/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
+++ b/llvm/test/CodeGen/X86/int-to-fp-demanded.ll
@@ -86,10 +86,10 @@ define <2 x i16> @sitofp_signbit_only_fail_bad_width2(i32 %i_in) nounwind {
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    fildl (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    andl $32768, %eax # imm = 0x8000
+; X86-NEXT:    shrl $16, %edx
 ; X86-NEXT:    andl $32768, %edx # imm = 0x8000
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    # kill: def $dx killed $dx killed $edx
@@ -139,8 +139,8 @@ define i32 @sitofp_multiuse_fail(i32 %i_in) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fildl {{[0-9]+}}(%esp)
-; X86-NEXT:    fsts {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    fsts (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll use.i32@PLT
 ; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
@@ -172,8 +172,8 @@ define i32 @sitofp_multiuse_okay(i32 %i_in) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fildl {{[0-9]+}}(%esp)
-; X86-NEXT:    fsts {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    fsts (%esp)
+; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll use.i1@PLT
 ; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/invpcid-intrinsic.ll b/llvm/test/CodeGen/X86/invpcid-intrinsic.ll
index 66f7855239c06..aec1af3b663cf 100644
--- a/llvm/test/CodeGen/X86/invpcid-intrinsic.ll
+++ b/llvm/test/CodeGen/X86/invpcid-intrinsic.ll
@@ -31,9 +31,9 @@ define void @test_invpcid2(ptr readonly %type, ptr %descriptor) {
 ; X86-LABEL: test_invpcid2:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    invpcid (%eax), %ecx
+; X86-NEXT:    invpcid (%ecx), %eax
 ; X86-NEXT:    retl
 ;
 ; X86_64-LABEL: test_invpcid2:
diff --git a/llvm/test/CodeGen/X86/ipra-local-linkage-2.ll b/llvm/test/CodeGen/X86/ipra-local-linkage-2.ll
index 05d3f70820fb0..76ec9d705ae9e 100644
--- a/llvm/test/CodeGen/X86/ipra-local-linkage-2.ll
+++ b/llvm/test/CodeGen/X86/ipra-local-linkage-2.ll
@@ -35,11 +35,27 @@ define internal void @callee_clobber_rbx(ptr %addr) nounwind norecurse {
 ; X64-NEXT:    #NO_APP
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
+;
+; X86-LABEL: callee_clobber_rbx:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    #APP
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    #NO_APP
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
   call void asm sideeffect "xor %ebx, %ebx", "~{ebx}"()
   ret void
 }
 
 define internal void @callee_clobber_esi(ptr %addr) nounwind norecurse {
+; X64-LABEL: callee_clobber_esi:
+; X64:       # %bb.0:
+; X64-NEXT:    #APP
+; X64-NEXT:    xorl %esi, %esi
+; X64-NEXT:    #NO_APP
+; X64-NEXT:    retq
+;
 ; X86-LABEL: callee_clobber_esi:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
@@ -109,6 +125,36 @@ define void @caller_use_rbx(i32 %X) nounwind ssp {
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB4_2:
 ; X64-NEXT:    callq __stack_chk_fail@PLT
+;
+; X86-LABEL: caller_use_rbx:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    andl $-32, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movl %esp, %esi
+; X86-NEXT:    movl __stack_chk_guard, %eax
+; X86-NEXT:    movl %eax, 16(%esi)
+; X86-NEXT:    movl %esp, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    shll $2, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, %esp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll callee_clobber_rbx
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl __stack_chk_guard, %eax
+; X86-NEXT:    cmpl 16(%esi), %eax
+; X86-NEXT:    jne .LBB4_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_2:
+; X86-NEXT:    calll __stack_chk_fail
   %realign = alloca i32, align 32
   %addr = alloca i32, i32 %X
   call void @callee_clobber_rbx(ptr %realign)
@@ -116,6 +162,35 @@ define void @caller_use_rbx(i32 %X) nounwind ssp {
 }
 
 define void @caller_use_esi(i32 %X) nounwind ssp {
+; X64-LABEL: caller_use_esi:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $64, %rsp
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    movq __stack_chk_guard(%rip), %rax
+; X64-NEXT:    movq %rax, 32(%rbx)
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    leaq 15(,%rcx,4), %rcx
+; X64-NEXT:    andq $-16, %rcx
+; X64-NEXT:    subq %rcx, %rax
+; X64-NEXT:    movq %rax, %rsp
+; X64-NEXT:    movq %rbx, %rdi
+; X64-NEXT:    callq callee_clobber_esi
+; X64-NEXT:    movq __stack_chk_guard(%rip), %rax
+; X64-NEXT:    cmpq 32(%rbx), %rax
+; X64-NEXT:    jne .LBB5_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    leaq -8(%rbp), %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB5_2:
+; X64-NEXT:    callq __stack_chk_fail@PLT
+;
 ; X86-LABEL: caller_use_esi:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
@@ -124,13 +199,13 @@ define void @caller_use_esi(i32 %X) nounwind ssp {
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    movl %esp, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl __stack_chk_guard, %ecx
-; X86-NEXT:    movl %ecx, 16(%esi)
-; X86-NEXT:    movl %esp, %ecx
-; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esp
+; X86-NEXT:    movl __stack_chk_guard, %eax
+; X86-NEXT:    movl %eax, 16(%esi)
+; X86-NEXT:    movl %esp, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    shll $2, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, %esp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll callee_clobber_esi
diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll
index 2d875d7689529..22f6c3beec766 100644
--- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll
@@ -27,17 +27,18 @@ entry:
 define i1 @is_nan_f80_strict(x86_fp80 %x) nounwind strictfp {
 ; X86-LABEL: is_nan_f80_strict:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl $32767, %esi # imm = 0x7FFF
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    setl %dl
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    sete %cl
@@ -47,6 +48,7 @@ define i1 @is_nan_f80_strict(x86_fp80 %x) nounwind strictfp {
 ; X86-NEXT:    orb %dl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: is_nan_f80_strict:
@@ -74,27 +76,27 @@ entry:
 define i1 @is_snan_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_snan_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl $32767, %esi # imm = 0x7FFF
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setl %bl
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    cmpl $-1073741824, %edx # imm = 0xC0000000
 ; X86-NEXT:    sbbl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    setl %al
-; X86-NEXT:    andb %bl, %al
+; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: is_snan_f80:
@@ -122,15 +124,17 @@ entry:
 define i1 @is_qnan_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_qnan_f80:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl $-1073741825, %edx # imm = 0xBFFFFFFF
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
+; X86-NEXT:    movl $-1073741825, %esi # imm = 0xBFFFFFFF
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    setl %al
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: is_qnan_f80:
@@ -237,13 +241,13 @@ entry:
 define i1 @is_inf_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_inf_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
@@ -265,12 +269,12 @@ entry:
 define i1 @is_posinf_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_posinf_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
@@ -291,12 +295,12 @@ entry:
 define i1 @is_neginf_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_neginf_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl $65535, %ecx # imm = 0xFFFF
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
@@ -317,15 +321,15 @@ entry:
 define i1 @is_normal_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_normal_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    decl %ecx
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl $32766, %ecx # imm = 0x7FFE
-; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
+; X86-NEXT:    decl %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl $32766, %eax # imm = 0x7FFE
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -351,24 +355,22 @@ entry:
 define i1 @is_posnormal_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_posnormal_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    decl %ecx
+; X86-NEXT:    movzwl %cx, %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl $32766, %ecx # imm = 0x7FFE
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    testl $32768, %eax # imm = 0x8000
+; X86-NEXT:    sete %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $32767, %edx # imm = 0x7FFF
-; X86-NEXT:    decl %edx
-; X86-NEXT:    movzwl %dx, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpl $32766, %edx # imm = 0x7FFE
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setb %dl
-; X86-NEXT:    testl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    sete %cl
 ; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: is_posnormal_f80:
@@ -395,24 +397,22 @@ entry:
 define i1 @is_negnormal_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_negnormal_f80:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    decl %ecx
+; X86-NEXT:    movzwl %cx, %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl $32766, %ecx # imm = 0x7FFE
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    testl $32768, %eax # imm = 0x8000
+; X86-NEXT:    setne %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $32767, %edx # imm = 0x7FFF
-; X86-NEXT:    decl %edx
-; X86-NEXT:    movzwl %dx, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    cmpl $32766, %edx # imm = 0x7FFE
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    setb %dl
-; X86-NEXT:    testl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    setne %cl
 ; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    andb %dl, %al
+; X86-NEXT:    andb %cl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: is_negnormal_f80:
@@ -440,19 +440,19 @@ define i1 @is_subnormal_f80(x86_fp80 %x) nounwind {
 ; X86-LABEL: is_subnormal_f80:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    addl $-1, %esi
-; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    sbbl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    cmpl $-1, %edx
+; X86-NEXT:    sbbl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -516,20 +516,20 @@ define i1 @is_negsubnormal_f80(x86_fp80 %x) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    addl $-1, %edi
-; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl $-1, %esi
+; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    cmpl $-1, %edi
-; X86-NEXT:    sbbl $2147483647, %edx # imm = 0x7FFFFFFF
+; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    cmpl $-1, %esi
+; X86-NEXT:    sbbl $2147483647, %edi # imm = 0x7FFFFFFF
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    testl $32768, %eax # imm = 0x8000
 ; X86-NEXT:    setne %al
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 70062b245f36d..31c6a6989aca2 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1222,13 +1222,13 @@ entry:
 define i1 @issubnormal_d(double %x) {
 ; X86-LABEL: issubnormal_d:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    cmpl $-1, %eax
-; X86-NEXT:    sbbl $1048575, %ecx # imm = 0xFFFFF
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    cmpl $-1, %ecx
+; X86-NEXT:    sbbl $1048575, %eax # imm = 0xFFFFF
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
@@ -1284,9 +1284,9 @@ define i1 @issignaling_d(double %x) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    cmpl $2146959360, %eax # imm = 0x7FF80000
@@ -1364,9 +1364,9 @@ define i1 @isnan_d_strictfp(double %x) strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    retl
@@ -1464,12 +1464,12 @@ define <2 x i1> @isnan_v2f(<2 x float> %x) {
 ; X86-LABEL: isnan_v2f:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setp %cl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
@@ -1493,12 +1493,12 @@ define <2 x i1> @isnot_nan_v2f(<2 x float> %x) {
 ; X86-LABEL: isnot_nan_v2f:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
 ; X86-NEXT:    setnp %cl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
@@ -1544,39 +1544,38 @@ entry:
 define <4 x i1> @isnan_v4f(<4 x float> %x) {
 ; X86-LABEL: isnan_v4f:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %dh
-; X86-NEXT:    shlb $2, %dh
+; X86-NEXT:    setp %dl
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %dl
-; X86-NEXT:    shlb $3, %dl
-; X86-NEXT:    orb %dh, %dl
+; X86-NEXT:    setp %cl
+; X86-NEXT:    shlb $3, %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %dh
+; X86-NEXT:    setp %dl
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fucomp %st(0)
 ; X86-NEXT:    fnstsw %ax
 ; X86-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X86-NEXT:    sahf
-; X86-NEXT:    setp %al
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    orb %dh, %al
-; X86-NEXT:    orb %dl, %al
-; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    setp %ch
+; X86-NEXT:    addb %ch, %ch
+; X86-NEXT:    orb %dl, %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %ch, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: isnan_v4f:
@@ -1591,35 +1590,30 @@ entry:
 define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
 ; X86-LABEL: isnan_v4f_strictfp:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    cmpl $2139095041, %edx # imm = 0x7F800001
-; X86-NEXT:    setge %dh
-; X86-NEXT:    shlb $2, %dh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, %ecx
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
 ; X86-NEXT:    setge %dl
-; X86-NEXT:    shlb $3, %dl
-; X86-NEXT:    orb %dh, %dl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
-; X86-NEXT:    setge %dh
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shlb $2, %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
 ; X86-NEXT:    setge %cl
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    orb %dh, %cl
+; X86-NEXT:    shlb $3, %cl
 ; X86-NEXT:    orb %dl, %cl
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    cmpl $2139095041, %edx # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %ch
+; X86-NEXT:    addb %ch, %ch
+; X86-NEXT:    orb %dl, %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %ch, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: isnan_v4f_strictfp:
diff --git a/llvm/test/CodeGen/X86/isel-and.ll b/llvm/test/CodeGen/X86/isel-and.ll
index 3fda0e1d86391..304e83b1909c9 100644
--- a/llvm/test/CodeGen/X86/isel-and.ll
+++ b/llvm/test/CodeGen/X86/isel-and.ll
@@ -437,9 +437,9 @@ define i32 @and_imm16_i32(i32 %a) {
 define i64 @and_imm16_i64(i64 %a) {
 ; SDAG-X86-LABEL: and_imm16_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-5022, %eax # imm = 0xEC62
 ; SDAG-X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    retl
 ;
 ; FASTISEL-X86-LABEL: and_imm16_i64:
@@ -497,9 +497,9 @@ define i32 @and_imm32_i32(i32 %a) {
 define i64 @and_imm32_i64(i64 %a) {
 ; SDAG-X86-LABEL: and_imm32_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-125778, %eax # imm = 0xFFFE14AE
 ; SDAG-X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    retl
 ;
 ; FASTISEL-X86-LABEL: and_imm32_i64:
@@ -529,9 +529,9 @@ define i64 @and_imm32_i64(i64 %a) {
 define i64 @and_imm64_i64(i64 %a) {
 ; SDAG-X86-LABEL: and_imm64_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-1850691612, %eax # imm = 0x91B0AFE4
 ; SDAG-X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    andl $-2, %edx
 ; SDAG-X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/isel-fabs.ll b/llvm/test/CodeGen/X86/isel-fabs.ll
index c2d29248e49ba..5282a2ecad258 100644
--- a/llvm/test/CodeGen/X86/isel-fabs.ll
+++ b/llvm/test/CodeGen/X86/isel-fabs.ll
@@ -58,9 +58,9 @@ define double @test_double_abs(double %arg) nounwind {
 ;
 ; X86-LABEL: test_double_abs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; FASTISEL-X86-LABEL: test_double_abs:
diff --git a/llvm/test/CodeGen/X86/isel-fneg.ll b/llvm/test/CodeGen/X86/isel-fneg.ll
index 77b3f263213a9..3aa2850aecfeb 100644
--- a/llvm/test/CodeGen/X86/isel-fneg.ll
+++ b/llvm/test/CodeGen/X86/isel-fneg.ll
@@ -86,14 +86,41 @@ define float @fneg_f32(float %x) nounwind {
 }
 
 define void @fneg_f64_mem(ptr %x, ptr %y) nounwind {
-; X86-LABEL: fneg_f64_mem:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    fldl (%ecx)
-; X86-NEXT:    fchs
-; X86-NEXT:    fstpl (%eax)
-; X86-NEXT:    retl
+; FASTISEL-X86-LABEL: fneg_f64_mem:
+; FASTISEL-X86:       # %bb.0:
+; FASTISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FASTISEL-X86-NEXT:    fldl (%ecx)
+; FASTISEL-X86-NEXT:    fchs
+; FASTISEL-X86-NEXT:    fstpl (%eax)
+; FASTISEL-X86-NEXT:    retl
+;
+; SDAG-X86-LABEL: fneg_f64_mem:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    fldl (%eax)
+; SDAG-X86-NEXT:    fchs
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    fstpl (%eax)
+; SDAG-X86-NEXT:    retl
+;
+; FASTISEL-SSE-X86-LABEL: fneg_f64_mem:
+; FASTISEL-SSE-X86:       # %bb.0:
+; FASTISEL-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FASTISEL-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FASTISEL-SSE-X86-NEXT:    fldl (%ecx)
+; FASTISEL-SSE-X86-NEXT:    fchs
+; FASTISEL-SSE-X86-NEXT:    fstpl (%eax)
+; FASTISEL-SSE-X86-NEXT:    retl
+;
+; SDAG-SSE-X86-LABEL: fneg_f64_mem:
+; SDAG-SSE-X86:       # %bb.0:
+; SDAG-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-SSE-X86-NEXT:    fldl (%eax)
+; SDAG-SSE-X86-NEXT:    fchs
+; SDAG-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-SSE-X86-NEXT:    fstpl (%eax)
+; SDAG-SSE-X86-NEXT:    retl
 ;
 ; FASTISEL-SSE-X64-LABEL: fneg_f64_mem:
 ; FASTISEL-SSE-X64:       # %bb.0:
@@ -137,10 +164,10 @@ define void @fneg_f32_mem(ptr %x, ptr %y) nounwind {
 ; SDAG-X86-LABEL: fneg_f32_mem:
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG-X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
-; SDAG-X86-NEXT:    xorl (%ecx), %edx
-; SDAG-X86-NEXT:    movl %edx, (%eax)
+; SDAG-X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; SDAG-X86-NEXT:    xorl (%eax), %ecx
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %ecx, (%eax)
 ; SDAG-X86-NEXT:    retl
 ;
 ; FASTISEL-SSE-X86-LABEL: fneg_f32_mem:
@@ -155,10 +182,10 @@ define void @fneg_f32_mem(ptr %x, ptr %y) nounwind {
 ; SDAG-SSE-X86-LABEL: fneg_f32_mem:
 ; SDAG-SSE-X86:       # %bb.0:
 ; SDAG-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG-SSE-X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
-; SDAG-SSE-X86-NEXT:    xorl (%ecx), %edx
-; SDAG-SSE-X86-NEXT:    movl %edx, (%eax)
+; SDAG-SSE-X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; SDAG-SSE-X86-NEXT:    xorl (%eax), %ecx
+; SDAG-SSE-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-SSE-X86-NEXT:    movl %ecx, (%eax)
 ; SDAG-SSE-X86-NEXT:    retl
 ;
 ; FASTISEL-SSE-X64-LABEL: fneg_f32_mem:
diff --git a/llvm/test/CodeGen/X86/isel-llvm.atan2.ll b/llvm/test/CodeGen/X86/isel-llvm.atan2.ll
index 2f43cd001b122..958457e86b869 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.atan2.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.atan2.ll
@@ -9,8 +9,8 @@ define float @use_atan2f32(float %a, float %b) nounwind {
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $12, %esp
 ; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstps (%esp)
 ; SDAG-X86-NEXT:    calll atan2f
 ; SDAG-X86-NEXT:    addl $12, %esp
@@ -46,8 +46,8 @@ define double @use_atan2f64(double %a, double %b) nounwind {
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $28, %esp
 ; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpl {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpl (%esp)
 ; SDAG-X86-NEXT:    calll atan2
 ; SDAG-X86-NEXT:    addl $28, %esp
@@ -93,8 +93,8 @@ define x86_fp80 @use_atan2f80(x86_fp80 %a, x86_fp80 %b) nounwind {
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $28, %esp
 ; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpt (%esp)
 ; SDAG-X86-NEXT:    calll atan2l
 ; SDAG-X86-NEXT:    addl $28, %esp
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 8576f8f149e9a..cdb5b2377112d 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -11,20 +11,20 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
 
 define { float, float } @test_sincos_f32(float %Val) nounwind {
-; X86-LABEL: test_sincos_f32:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp)
-; X86-NEXT:    calll sincosf
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    retl
+; FASTISEL-X86-LABEL: test_sincos_f32:
+; FASTISEL-X86:       # %bb.0:
+; FASTISEL-X86-NEXT:    subl $28, %esp
+; FASTISEL-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    fstps (%esp)
+; FASTISEL-X86-NEXT:    calll sincosf
+; FASTISEL-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    addl $28, %esp
+; FASTISEL-X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sincos_f32:
 ; X64:       # %bb.0:
@@ -37,6 +37,21 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
 ;
+; SDAG-X86-LABEL: test_sincos_f32:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    subl $28, %esp
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fstps (%esp)
+; SDAG-X86-NEXT:    calll sincosf
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    addl $28, %esp
+; SDAG-X86-NEXT:    retl
+;
 ; MACOS-SINCOS-STRET-LABEL: test_sincos_f32:
 ; MACOS-SINCOS-STRET:       ## %bb.0:
 ; MACOS-SINCOS-STRET-NEXT:    pushq %rax
@@ -93,20 +108,20 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
 }
 
 define { double, double } @test_sincos_f64(double %Val) nounwind  {
-; X86-LABEL: test_sincos_f64:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpl (%esp)
-; X86-NEXT:    calll sincos
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    retl
+; FASTISEL-X86-LABEL: test_sincos_f64:
+; FASTISEL-X86:       # %bb.0:
+; FASTISEL-X86-NEXT:    subl $44, %esp
+; FASTISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    fstpl (%esp)
+; FASTISEL-X86-NEXT:    calll sincos
+; FASTISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    addl $44, %esp
+; FASTISEL-X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sincos_f64:
 ; X64:       # %bb.0:
@@ -119,6 +134,21 @@ define { double, double } @test_sincos_f64(double %Val) nounwind  {
 ; X64-NEXT:    addq $24, %rsp
 ; X64-NEXT:    retq
 ;
+; SDAG-X86-LABEL: test_sincos_f64:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    subl $44, %esp
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fstpl (%esp)
+; SDAG-X86-NEXT:    calll sincos
+; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    addl $44, %esp
+; SDAG-X86-NEXT:    retl
+;
 ; MACOS-SINCOS-STRET-LABEL: test_sincos_f64:
 ; MACOS-SINCOS-STRET:       ## %bb.0:
 ; MACOS-SINCOS-STRET-NEXT:    pushq %rax
@@ -144,11 +174,11 @@ define { double, double } @test_sincos_f64(double %Val) nounwind  {
 ; GISEL-X86-LABEL: test_sincos_f64:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $44, %esp
-; GISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    fstpl (%esp)
 ; GISEL-X86-NEXT:    calll sincos
 ; GISEL-X86-NEXT:    fldl {{[0-9]+}}(%esp)
@@ -173,20 +203,20 @@ define { double, double } @test_sincos_f64(double %Val) nounwind  {
 }
 
 define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
-; X86-LABEL: test_sincos_f80:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpt (%esp)
-; X86-NEXT:    calll sincosl
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    retl
+; FASTISEL-X86-LABEL: test_sincos_f80:
+; FASTISEL-X86:       # %bb.0:
+; FASTISEL-X86-NEXT:    subl $44, %esp
+; FASTISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    fstpt (%esp)
+; FASTISEL-X86-NEXT:    calll sincosl
+; FASTISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT:    addl $44, %esp
+; FASTISEL-X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sincos_f80:
 ; X64:       # %bb.0:
@@ -201,6 +231,21 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
 ; X64-NEXT:    addq $56, %rsp
 ; X64-NEXT:    retq
 ;
+; SDAG-X86-LABEL: test_sincos_f80:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    subl $44, %esp
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fstpt (%esp)
+; SDAG-X86-NEXT:    calll sincosl
+; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    addl $44, %esp
+; SDAG-X86-NEXT:    retl
+;
 ; MACOS-SINCOS-STRET-LABEL: test_sincos_f80:
 ; MACOS-SINCOS-STRET:       ## %bb.0:
 ; MACOS-SINCOS-STRET-NEXT:    subq $40, %rsp
@@ -329,17 +374,15 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %
 ; SDAG-X86-NEXT:    pushl %edi
 ; SDAG-X86-NEXT:    pushl %esi
 ; SDAG-X86-NEXT:    subl $20, %esp
-; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; SDAG-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; SDAG-X86-NEXT:    movl %edi, (%esp)
; SDAG-X86-NEXT:    calll foo@PLT
 ; SDAG-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstps (%esp)
 ; SDAG-X86-NEXT:    calll sincosf
 ; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
@@ -486,3 +529,5 @@ entry:
   store float %cos, ptr %b, align 4
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/isel-or.ll b/llvm/test/CodeGen/X86/isel-or.ll
index 8b191c007675c..eaef14cd247a3 100644
--- a/llvm/test/CodeGen/X86/isel-or.ll
+++ b/llvm/test/CodeGen/X86/isel-or.ll
@@ -515,9 +515,9 @@ define i64 @or_imm32_i64(i64 %a) {
 define i64 @or_imm64_i64(i64 %a) {
 ; SDAG-X86-LABEL: or_imm64_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-1850691612, %eax # imm = 0x91B0AFE4
 ; SDAG-X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    orl $-2, %edx
 ; SDAG-X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/isel-sdiv.ll b/llvm/test/CodeGen/X86/isel-sdiv.ll
index 1aca8d1035664..398ff72a02d54 100644
--- a/llvm/test/CodeGen/X86/isel-sdiv.ll
+++ b/llvm/test/CodeGen/X86/isel-sdiv.ll
@@ -84,17 +84,6 @@ define i64 @test_sdiv_i64(i64 %arg1, i64 %arg2) nounwind {
 ; X64-NEXT:    idivq %rsi
 ; X64-NEXT:    retq
 ;
-; DAG-X86-LABEL: test_sdiv_i64:
-; DAG-X86:       # %bb.0:
-; DAG-X86-NEXT:    subl $12, %esp
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    calll __divdi3
-; DAG-X86-NEXT:    addl $28, %esp
-; DAG-X86-NEXT:    retl
-;
 ; GISEL-X86-LABEL: test_sdiv_i64:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    pushl %esi
diff --git a/llvm/test/CodeGen/X86/isel-select-cmov.ll b/llvm/test/CodeGen/X86/isel-select-cmov.ll
index d013ad2c7fbff..4393b9dec64b4 100644
--- a/llvm/test/CodeGen/X86/isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-cmov.ll
@@ -60,9 +60,9 @@ define zeroext i8 @select_cmov_i8(i1 zeroext %cond, i8 zeroext %a, i8 zeroext %b
 ;
 ; SDAG-X86-CMOV-LABEL: select_cmov_i8:
 ; SDAG-X86-CMOV:       ## %bb.0:
-; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    cmovnel %eax, %ecx
 ; SDAG-X86-CMOV-NEXT:    movzbl (%ecx), %eax
 ; SDAG-X86-CMOV-NEXT:    retl
@@ -168,9 +168,9 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex
 ;
 ; SDAG-X86-CMOV-LABEL: select_cmov_i16:
 ; SDAG-X86-CMOV:       ## %bb.0:
-; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    cmovnel %eax, %ecx
 ; SDAG-X86-CMOV-NEXT:    movzwl (%ecx), %eax
 ; SDAG-X86-CMOV-NEXT:    retl
@@ -373,9 +373,9 @@ define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
 ;
 ; SDAG-X86-CMOV-LABEL: select_cmov_i32:
 ; SDAG-X86-CMOV:       ## %bb.0:
-; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    cmovnel %eax, %ecx
 ; SDAG-X86-CMOV-NEXT:    movl (%ecx), %eax
 ; SDAG-X86-CMOV-NEXT:    retl
@@ -563,9 +563,9 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
 ;
 ; SDAG-X86-CMOV-LABEL: select_cmov_i64:
 ; SDAG-X86-CMOV:       ## %bb.0:
-; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-CMOV-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-CMOV-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; SDAG-X86-CMOV-NEXT:    cmovnel %eax, %ecx
 ; SDAG-X86-CMOV-NEXT:    movl (%ecx), %eax
 ; SDAG-X86-CMOV-NEXT:    movl 4(%ecx), %edx
diff --git a/llvm/test/CodeGen/X86/isel-select-fcmov.ll b/llvm/test/CodeGen/X86/isel-select-fcmov.ll
index cb441b860bb56..b54c037f347b0 100644
--- a/llvm/test/CodeGen/X86/isel-select-fcmov.ll
+++ b/llvm/test/CodeGen/X86/isel-select-fcmov.ll
@@ -13,8 +13,8 @@ define x86_fp80 @cmove_cmp(x86_fp80 %a, x86_fp80 %b, i32 %c) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fadd %st(1), %st
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:    fcmove %st(1), %st
 ; X86-NEXT:    fstp %st(1)
@@ -119,18 +119,6 @@ define x86_fp80 @cmove_arg(x86_fp80 %a, x86_fp80 %b, i1 %test) {
 }
 
 define x86_fp80 @cmove_load(x86_fp80 %a, x86_fp80 %b, ptr %p) {
-; X86-LABEL: cmove_load:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fadd %st(1), %st
-; X86-NEXT:    cmpb $0, (%eax)
-; X86-NEXT:    fxch %st(1)
-; X86-NEXT:    fcmovne %st(1), %st
-; X86-NEXT:    fstp %st(1)
-; X86-NEXT:    retl
-;
 ; X86-GISEL-LABEL: cmove_load:
 ; X86-GISEL:       # %bb.0:
 ; X86-GISEL-NEXT:    fldt {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/isel-sint-to-fp-x87.ll b/llvm/test/CodeGen/X86/isel-sint-to-fp-x87.ll
index 98fa3c9541faa..e3a2177c82c50 100644
--- a/llvm/test/CodeGen/X86/isel-sint-to-fp-x87.ll
+++ b/llvm/test/CodeGen/X86/isel-sint-to-fp-x87.ll
@@ -30,10 +30,10 @@ define void @test_int8_to_float(i8 %x, ptr %p) nounwind {
 ; SDAG-X86:       # %bb.0: # %entry
 ; SDAG-X86-NEXT:    pushl %eax
 ; SDAG-X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SDAG-X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    filds {{[0-9]+}}(%esp)
-; SDAG-X86-NEXT:    fstps (%ecx)
+; SDAG-X86-NEXT:    fstps (%eax)
 ; SDAG-X86-NEXT:    popl %eax
 ; SDAG-X86-NEXT:    retl
 ;
@@ -66,9 +66,9 @@ define void @test_int16_to_float(i16 %x, ptr %p) nounwind {
 ; SDAG-X86-LABEL: test_int16_to_float:
 ; SDAG-X86:       # %bb.0: # %entry
 ; SDAG-X86-NEXT:    pushl %eax
+; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; SDAG-X86-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    filds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstps (%eax)
 ; SDAG-X86-NEXT:    popl %eax
@@ -102,8 +102,8 @@ define void @test_int32_to_float(i32 %x, ptr %p) nounwind {
 ; SDAG-X86:       # %bb.0: # %entry
 ; SDAG-X86-NEXT:    pushl %eax
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG-X86-NEXT:    movl %ecx, (%esp)
+; SDAG-X86-NEXT:    movl %eax, (%esp)
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    fildl (%esp)
 ; SDAG-X86-NEXT:    fstps (%eax)
 ; SDAG-X86-NEXT:    popl %eax
@@ -135,8 +135,8 @@ define void @test_int64_to_float(i64 %x, ptr %p) nounwind {
 ;
 ; X86-LABEL: test_int64_to_float:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    fstps (%eax)
 ; X86-NEXT:    retl
 entry:
@@ -260,10 +260,10 @@ define void @test_int8to_double(i8 %x, ptr %p) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpl (%ecx)
+; X86-NEXT:    fstpl (%eax)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
 entry:
@@ -283,9 +283,9 @@ define void @test_int16_to_double(i16 %x, ptr %p) nounwind {
 ; X86-LABEL: test_int16_to_double:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %cx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    filds {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpl (%eax)
 ; X86-NEXT:    popl %eax
@@ -308,8 +308,8 @@ define void @test_int32_to_double(i32 %x, ptr %p) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    fildl (%esp)
 ; X86-NEXT:    fstpl (%eax)
 ; X86-NEXT:    popl %eax
@@ -330,8 +330,8 @@ define void @test_int64_to_double(i64 %x, ptr %p) nounwind {
 ;
 ; X86-LABEL: test_int64_to_double:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    fildll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    fstpl (%eax)
 ; X86-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/isel-srem.ll b/llvm/test/CodeGen/X86/isel-srem.ll
index 1dabf4175c852..01eee20977540 100644
--- a/llvm/test/CodeGen/X86/isel-srem.ll
+++ b/llvm/test/CodeGen/X86/isel-srem.ll
@@ -118,16 +118,33 @@ define i64 @test_srem_i64(i64 %arg1, i64 %arg2) nounwind {
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    retq
 ;
-; DAG-X86-LABEL: test_srem_i64:
-; DAG-X86:       # %bb.0:
-; DAG-X86-NEXT:    subl $12, %esp
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    calll __moddi3
-; DAG-X86-NEXT:    addl $28, %esp
-; DAG-X86-NEXT:    retl
+; SDAG-X86-LABEL: test_srem_i64:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    pushl %esi
+; SDAG-X86-NEXT:    subl $8, %esp
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SDAG-X86-NEXT:    pushl %ecx
+; SDAG-X86-NEXT:    pushl %eax
+; SDAG-X86-NEXT:    pushl %esi
+; SDAG-X86-NEXT:    pushl %edx
+; SDAG-X86-NEXT:    calll __moddi3
+; SDAG-X86-NEXT:    addl $24, %esp
+; SDAG-X86-NEXT:    popl %esi
+; SDAG-X86-NEXT:    retl
+;
+; FAST-X86-LABEL: test_srem_i64:
+; FAST-X86:       # %bb.0:
+; FAST-X86-NEXT:    subl $12, %esp
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    calll __moddi3
+; FAST-X86-NEXT:    addl $28, %esp
+; FAST-X86-NEXT:    retl
 ;
 ; GISEL-X86-LABEL: test_srem_i64:
 ; GISEL-X86:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/isel-udiv.ll b/llvm/test/CodeGen/X86/isel-udiv.ll
index b123b3c7780fa..2a6da26ae32d5 100644
--- a/llvm/test/CodeGen/X86/isel-udiv.ll
+++ b/llvm/test/CodeGen/X86/isel-udiv.ll
@@ -84,17 +84,6 @@ define i64 @test_udiv_i64(i64 %arg1, i64 %arg2) nounwind {
 ; X64-NEXT:    divq %rsi
 ; X64-NEXT:    retq
 ;
-; DAG-X86-LABEL: test_udiv_i64:
-; DAG-X86:       # %bb.0:
-; DAG-X86-NEXT:    subl $12, %esp
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    calll __udivdi3
-; DAG-X86-NEXT:    addl $28, %esp
-; DAG-X86-NEXT:    retl
-;
 ; GISEL-X86-LABEL: test_udiv_i64:
 ; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    pushl %esi
diff --git a/llvm/test/CodeGen/X86/isel-urem.ll b/llvm/test/CodeGen/X86/isel-urem.ll
index 386f08151ad9c..8d97ea1df8c0b 100644
--- a/llvm/test/CodeGen/X86/isel-urem.ll
+++ b/llvm/test/CodeGen/X86/isel-urem.ll
@@ -118,16 +118,33 @@ define i64 @test_urem_i64(i64 %arg1, i64 %arg2) nounwind {
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    retq
 ;
-; DAG-X86-LABEL: test_urem_i64:
-; DAG-X86:       # %bb.0:
-; DAG-X86-NEXT:    subl $12, %esp
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; DAG-X86-NEXT:    calll __umoddi3
-; DAG-X86-NEXT:    addl $28, %esp
-; DAG-X86-NEXT:    retl
+; SDAG-X86-LABEL: test_urem_i64:
+; SDAG-X86:       # %bb.0:
+; SDAG-X86-NEXT:    pushl %esi
+; SDAG-X86-NEXT:    subl $8, %esp
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; SDAG-X86-NEXT:    pushl %ecx
+; SDAG-X86-NEXT:    pushl %eax
+; SDAG-X86-NEXT:    pushl %esi
+; SDAG-X86-NEXT:    pushl %edx
+; SDAG-X86-NEXT:    calll __umoddi3
+; SDAG-X86-NEXT:    addl $24, %esp
+; SDAG-X86-NEXT:    popl %esi
+; SDAG-X86-NEXT:    retl
+;
+; FAST-X86-LABEL: test_urem_i64:
+; FAST-X86:       # %bb.0:
+; FAST-X86-NEXT:    subl $12, %esp
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; FAST-X86-NEXT:    calll __umoddi3
+; FAST-X86-NEXT:    addl $28, %esp
+; FAST-X86-NEXT:    retl
 ;
 ; GISEL-X86-LABEL: test_urem_i64:
 ; GISEL-X86:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/isel-x87.ll b/llvm/test/CodeGen/X86/isel-x87.ll
index 492faaa19cd66..d7a282d9390e0 100644
--- a/llvm/test/CodeGen/X86/isel-x87.ll
+++ b/llvm/test/CodeGen/X86/isel-x87.ll
@@ -86,15 +86,15 @@ define void @f1(ptr %a, ptr %b) nounwind {
 ; GISEL_X86-NEXT:    fstpt (%eax)
 ; GISEL_X86-NEXT:    retl
 ;
-; SDAG_X86-LABEL: f1:
-; SDAG_X86:       # %bb.0:
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG_X86-NEXT:    fldt (%ecx)
-; SDAG_X86-NEXT:    fldt (%eax)
-; SDAG_X86-NEXT:    fsubrp %st, %st(1)
-; SDAG_X86-NEXT:    fstpt (%ecx)
-; SDAG_X86-NEXT:    retl
+; FAST_X86-LABEL: f1:
+; FAST_X86:       # %bb.0:
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FAST_X86-NEXT:    fldt (%eax)
+; FAST_X86-NEXT:    fldt (%ecx)
+; FAST_X86-NEXT:    fsubp %st, %st(1)
+; FAST_X86-NEXT:    fstpt (%ecx)
+; FAST_X86-NEXT:    retl
 ;
 ; CHECK-64-LABEL: f1:
 ; CHECK-64:       # %bb.0:
@@ -121,15 +121,15 @@ define void @f2(ptr %a, ptr %b) nounwind {
 ; GISEL_X86-NEXT:    fstpt (%eax)
 ; GISEL_X86-NEXT:    retl
 ;
-; SDAG_X86-LABEL: f2:
-; SDAG_X86:       # %bb.0:
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG_X86-NEXT:    fldt (%ecx)
-; SDAG_X86-NEXT:    fldt (%eax)
-; SDAG_X86-NEXT:    fmulp %st, %st(1)
-; SDAG_X86-NEXT:    fstpt (%ecx)
-; SDAG_X86-NEXT:    retl
+; FAST_X86-LABEL: f2:
+; FAST_X86:       # %bb.0:
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FAST_X86-NEXT:    fldt (%eax)
+; FAST_X86-NEXT:    fldt (%ecx)
+; FAST_X86-NEXT:    fmulp %st, %st(1)
+; FAST_X86-NEXT:    fstpt (%ecx)
+; FAST_X86-NEXT:    retl
 ;
 ; CHECK-64-LABEL: f2:
 ; CHECK-64:       # %bb.0:
@@ -156,15 +156,15 @@ define void @f3(ptr %a, ptr %b) nounwind {
 ; GISEL_X86-NEXT:    fstpt (%eax)
 ; GISEL_X86-NEXT:    retl
 ;
-; SDAG_X86-LABEL: f3:
-; SDAG_X86:       # %bb.0:
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG_X86-NEXT:    fldt (%ecx)
-; SDAG_X86-NEXT:    fldt (%eax)
-; SDAG_X86-NEXT:    fdivrp %st, %st(1)
-; SDAG_X86-NEXT:    fstpt (%ecx)
-; SDAG_X86-NEXT:    retl
+; FAST_X86-LABEL: f3:
+; FAST_X86:       # %bb.0:
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FAST_X86-NEXT:    fldt (%eax)
+; FAST_X86-NEXT:    fldt (%ecx)
+; FAST_X86-NEXT:    fdivp %st, %st(1)
+; FAST_X86-NEXT:    fstpt (%ecx)
+; FAST_X86-NEXT:    retl
 ;
 ; CHECK-64-LABEL: f3:
 ; CHECK-64:       # %bb.0:
@@ -190,14 +190,14 @@ define void @f6(ptr %a, ptr %b) nounwind {
 ; GISEL_X86-NEXT:    fstps (%ecx)
 ; GISEL_X86-NEXT:    retl
 ;
-; SDAG_X86-LABEL: f6:
-; SDAG_X86:       # %bb.0:
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SDAG_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SDAG_X86-NEXT:    flds (%ecx)
-; SDAG_X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}
-; SDAG_X86-NEXT:    fstps (%eax)
-; SDAG_X86-NEXT:    retl
+; FAST_X86-LABEL: f6:
+; FAST_X86:       # %bb.0:
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FAST_X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FAST_X86-NEXT:    flds (%ecx)
+; FAST_X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}
+; FAST_X86-NEXT:    fstps (%eax)
+; FAST_X86-NEXT:    retl
 ;
 ; GISEL_X64-LABEL: f6:
 ; GISEL_X64:       # %bb.0:
@@ -220,4 +220,3 @@ define void @f6(ptr %a, ptr %b) nounwind {
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-32: {{.*}}
 ; FAST_X64: {{.*}}
-; FAST_X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/isel-xor.ll b/llvm/test/CodeGen/X86/isel-xor.ll
index 0549a6436a1a4..3eeb3e2d9653d 100644
--- a/llvm/test/CodeGen/X86/isel-xor.ll
+++ b/llvm/test/CodeGen/X86/isel-xor.ll
@@ -425,9 +425,9 @@ define i32 @xor_imm16_i32(i32 %a) {
 define i64 @xor_imm16_i64(i64 %a) {
 ; SDAG-X86-LABEL: xor_imm16_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-5022, %eax # imm = 0xEC62
 ; SDAG-X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    notl %edx
 ; SDAG-X86-NEXT:    retl
 ;
@@ -487,9 +487,9 @@ define i32 @xor_imm32_i32(i32 %a) {
 define i64 @xor_imm32_i64(i64 %a) {
 ; SDAG-X86-LABEL: xor_imm32_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-125778, %eax # imm = 0xFFFE14AE
 ; SDAG-X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    notl %edx
 ; SDAG-X86-NEXT:    retl
 ;
@@ -521,9 +521,9 @@ define i64 @xor_imm32_i64(i64 %a) {
 define i64 @xor_imm64_i64(i64 %a) {
 ; SDAG-X86-LABEL: xor_imm64_i64:
 ; SDAG-X86:       # %bb.0:
-; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    movl $-1850691612, %eax # imm = 0x91B0AFE4
 ; SDAG-X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; SDAG-X86-NEXT:    xorl $-2, %edx
 ; SDAG-X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index d28a93ec3d77c..31d9983ef1e58 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -47,8 +47,8 @@ define i32 @func_g(i32 %a, i32 %b) nounwind {
 define i32 @func_h(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_h:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    cmovlel %ecx, %eax
 ; CHECK-NEXT:    retl
@@ -61,8 +61,8 @@ define i32 @func_h(i32 %a, i32 %b) nounwind {
 define i32 @func_i(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_i:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    cmovlel %ecx, %eax
 ; CHECK-NEXT:    retl
@@ -75,8 +75,8 @@ define i32 @func_i(i32 %a, i32 %b) nounwind {
 define i32 @func_j(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_j:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    cmovbel %ecx, %eax
 ; CHECK-NEXT:    retl
@@ -89,8 +89,8 @@ define i32 @func_j(i32 %a, i32 %b) nounwind {
 define i32 @func_k(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_k:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    cmovbel %ecx, %eax
 ; CHECK-NEXT:    retl
@@ -185,8 +185,8 @@ if.else:
 define i32 @func_l4(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: func_l4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    cmovll %ecx, %eax
 ; CHECK-NEXT:    retl
@@ -302,8 +302,8 @@ define i32 @func_p(i32 %a, i32 %b) nounwind {
 define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) {
 ; CHECK-LABEL: func_q:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    sbbl %ecx, %ecx
 ; CHECK-NEXT:    negl %eax
diff --git a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
index f01411c64acb0..ebb8a9f91e97d 100644
--- a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll
@@ -44,16 +44,14 @@ define i32 @test_encodekey128_u32(i32 %htype, <2 x i64> %key, ptr nocapture %h0,
 ;
 ; X86-LABEL: test_encodekey128_u32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    encodekey128 %eax, %eax
-; X86-NEXT:    vmovaps %xmm0, (%esi)
-; X86-NEXT:    vmovaps %xmm1, (%edx)
+; X86-NEXT:    vmovaps %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovaps %xmm1, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovaps %xmm2, (%ecx)
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %htype, <2 x i64> %key)
@@ -79,20 +77,16 @@ define i32 @test_encodekey256_u32(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_
 ;
 ; X86-LABEL: test_encodekey256_u32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    encodekey256 %eax, %eax
-; X86-NEXT:    vmovaps %xmm0, (%edi)
-; X86-NEXT:    vmovaps %xmm1, (%esi)
-; X86-NEXT:    vmovaps %xmm2, (%edx)
+; X86-NEXT:    vmovaps %xmm0, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovaps %xmm1, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovaps %xmm2, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovaps %xmm3, (%ecx)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi)
@@ -118,11 +112,11 @@ define i8 @test_mm_aesenc128kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
 ;
 ; X86-LABEL: test_mm_aesenc128kl_u8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    aesenc128kl (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    aesenc128kl (%ecx), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    sete %al
-; X86-NEXT:    vmovaps %xmm0, (%ecx)
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %data, ptr %h)
@@ -142,11 +136,11 @@ define i8 @test_mm_aesdec128kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
 ;
 ; X86-LABEL: test_mm_aesdec128kl_u8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    aesdec128kl (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    aesdec128kl (%ecx), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    sete %al
-; X86-NEXT:    vmovaps %xmm0, (%ecx)
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %data, ptr %h)
@@ -166,11 +160,11 @@ define i8 @test_mm_aesenc256kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
 ;
 ; X86-LABEL: test_mm_aesenc256kl_u8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    aesenc256kl (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    aesenc256kl (%ecx), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    sete %al
-; X86-NEXT:    vmovaps %xmm0, (%ecx)
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, ptr %h)
@@ -190,11 +184,11 @@ define i8 @test_mm_aesdec256kl_u8(<2 x i64> %data, ptr %h, ptr %out) {
 ;
 ; X86-LABEL: test_mm_aesdec256kl_u8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    aesdec256kl (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    aesdec256kl (%ecx), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    sete %al
-; X86-NEXT:    vmovaps %xmm0, (%ecx)
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %data, ptr %h)
@@ -230,14 +224,14 @@ define i8 @test_mm_aesencwide128kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 104(%ebp), %eax
 ; X86-NEXT:    vmovaps 24(%ebp), %xmm3
 ; X86-NEXT:    vmovaps 40(%ebp), %xmm4
 ; X86-NEXT:    vmovaps 56(%ebp), %xmm5
 ; X86-NEXT:    vmovaps 72(%ebp), %xmm6
 ; X86-NEXT:    vmovaps 88(%ebp), %xmm7
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    aesencwide128kl (%eax)
-; X86-NEXT:    movl 104(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    aesencwide128kl (%ecx)
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    movl 108(%ebp), %eax
 ; X86-NEXT:    vmovaps %xmm1, (%eax)
@@ -305,14 +299,14 @@ define i8 @test_mm_aesdecwide128kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 104(%ebp), %eax
 ; X86-NEXT:    vmovaps 24(%ebp), %xmm3
 ; X86-NEXT:    vmovaps 40(%ebp), %xmm4
 ; X86-NEXT:    vmovaps 56(%ebp), %xmm5
 ; X86-NEXT:    vmovaps 72(%ebp), %xmm6
 ; X86-NEXT:    vmovaps 88(%ebp), %xmm7
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    aesdecwide128kl (%eax)
-; X86-NEXT:    movl 104(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    aesdecwide128kl (%ecx)
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    movl 108(%ebp), %eax
 ; X86-NEXT:    vmovaps %xmm1, (%eax)
@@ -380,14 +374,14 @@ define i8 @test_mm_aesencwide256kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 104(%ebp), %eax
 ; X86-NEXT:    vmovaps 24(%ebp), %xmm3
 ; X86-NEXT:    vmovaps 40(%ebp), %xmm4
 ; X86-NEXT:    vmovaps 56(%ebp), %xmm5
 ; X86-NEXT:    vmovaps 72(%ebp), %xmm6
 ; X86-NEXT:    vmovaps 88(%ebp), %xmm7
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    aesencwide256kl (%eax)
-; X86-NEXT:    movl 104(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    aesencwide256kl (%ecx)
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    movl 108(%ebp), %eax
 ; X86-NEXT:    vmovaps %xmm1, (%eax)
@@ -455,14 +449,14 @@ define i8 @test_mm_aesdecwide256kl_u8(ptr %p, <2 x i64> %v0, <2 x i64> %v1, <2 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 104(%ebp), %eax
 ; X86-NEXT:    vmovaps 24(%ebp), %xmm3
 ; X86-NEXT:    vmovaps 40(%ebp), %xmm4
 ; X86-NEXT:    vmovaps 56(%ebp), %xmm5
 ; X86-NEXT:    vmovaps 72(%ebp), %xmm6
 ; X86-NEXT:    vmovaps 88(%ebp), %xmm7
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    aesdecwide256kl (%eax)
-; X86-NEXT:    movl 104(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    aesdecwide256kl (%ecx)
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    movl 108(%ebp), %eax
 ; X86-NEXT:    vmovaps %xmm1, (%eax)
@@ -518,10 +512,10 @@ define i8 @test_mm_aesenc256kl_u8_global(<2 x i64> %data, ptr %out) {
 ;
 ; X86-LABEL: test_mm_aesenc256kl_u8_global:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    aesenc256kl foo, %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    sete %al
-; X86-NEXT:    vmovaps %xmm0, (%ecx)
 ; X86-NEXT:    retl
 entry:
   %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, ptr @foo)
diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll
index 261908fafc06e..f4417ef5252af 100644
--- a/llvm/test/CodeGen/X86/known-bits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-bits-vector.ll
@@ -46,8 +46,8 @@ define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nou
 ; X86-LABEL: knownbits_insert_uitofp:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
@@ -592,11 +592,11 @@ define <4 x float> @knownbits_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x i32>
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovaps 8(%ebp), %xmm3
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm1
+; X86-NEXT:    vmovaps 8(%ebp), %xmm2
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    movl %ebp, %esp
@@ -628,11 +628,11 @@ define <4 x float> @knownbits_lshr_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovaps 8(%ebp), %xmm3
-; X86-NEXT:    vpsrld $5, %xmm2, %xmm2
-; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3
 ; X86-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X86-NEXT:    vpsrld $5, %xmm2, %xmm1
+; X86-NEXT:    vmovaps 8(%ebp), %xmm2
+; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
+; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index b29ccaf9981bc..9da198ee360e7 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -7,11 +7,11 @@ define void @knownbits_zext_in_reg(ptr) nounwind {
 ; X86:       # %bb.0: # %BB
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl (%eax), %ecx
-; X86-NEXT:    imull $101, %ecx, %eax
-; X86-NEXT:    shrl $14, %eax
-; X86-NEXT:    imull $177, %ecx, %edx
+; X86-NEXT:    movzbl (%eax), %eax
+; X86-NEXT:    imull $177, %eax, %edx
 ; X86-NEXT:    shrl $14, %edx
+; X86-NEXT:    imull $101, %eax, %eax
+; X86-NEXT:    shrl $14, %eax
 ; X86-NEXT:    movzbl %al, %ecx
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    .p2align 4
@@ -88,26 +88,24 @@ define i32 @knownbits_mask_add_lshr(i32 %a0, i32 %a1) nounwind {
 define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
 ; X86-LABEL: knownbits_mask_addc_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $-1024, %ecx # imm = 0xFC00
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $-1024, %esi # imm = 0xFC00
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl %esi, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    shldl $22, %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    shrdl $10, %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    shldl $22, %esi, %edx
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
 ; X86-NEXT:    movl $0, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: knownbits_mask_addc_shl:
@@ -136,8 +134,8 @@ define {i32, i1} @knownbits_uaddo_saddo(i64 %a0, i64 %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    seto %dl
+; X86-NEXT:    seto %al
+; X86-NEXT:    setb %dl
 ; X86-NEXT:    orb %al, %dl
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
@@ -173,8 +171,8 @@ define {i32, i1} @knownbits_usubo_ssubo(i64 %a0, i64 %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setb %al
-; X86-NEXT:    seto %dl
+; X86-NEXT:    seto %al
+; X86-NEXT:    setb %dl
 ; X86-NEXT:    orb %al, %dl
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 37c10219b471d..21804bf869ccf 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -27,8 +27,8 @@ define i32 @or_known_nonzero(i32 %x) {
 define i32 @or_known_nonzero_vec(<4 x i32> %x, ptr %p) {
 ; X86-LABEL: or_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -53,9 +53,9 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_maybe_zero:
@@ -74,8 +74,8 @@ define i32 @select_known_nonzero(i1 %c, i32 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $122, %ecx
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %eax, %ecx
 ; X86-NEXT:    rep bsfl %ecx, %eax
 ; X86-NEXT:    retl
@@ -102,9 +102,9 @@ define i32 @select_maybe_zero(i1 %c, i32 %x) {
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnel %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: select_maybe_zero:
@@ -125,13 +125,13 @@ define i32 @select_maybe_zero(i1 %c, i32 %x) {
 define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
 ; X86-LABEL: extractelt_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    pcmpgtd %xmm0, %xmm1
 ; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    por %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -156,19 +156,19 @@ define i32 @extractelt_nonzero_vec(<4 x i32> %a0, ptr %p1, i32 %a2) {
 define i32 @extractelt_nonzero_vec_fail0(<4 x i32> %a0, ptr %p1, i32 %a2) {
 ; X86-LABEL: extractelt_nonzero_vec_fail0:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    pcmpgtd %xmm0, %xmm1
 ; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    por %xmm0, %xmm1
-; X86-NEXT:    movdqa %xmm1, (%ecx)
-; X86-NEXT:    bsfl (%ecx,%eax,4), %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm1, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl (%eax,%ecx,4), %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractelt_nonzero_vec_fail0:
@@ -214,13 +214,13 @@ define i32 @shl_known_nonzero_1s_bit_set(i32 %x) {
 define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X86-LABEL: shl_known_nonzero_1s_bit_set_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pslld $23, %xmm0
 ; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    cvttps2dq %xmm0, %xmm0
 ; X86-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [123,0,0,0]
 ; X86-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,0,4294967295,0]
 ; X86-NEXT:    pand %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -246,9 +246,9 @@ define i32 @shl_known_nonzero_1s_bit_set_vec(<4 x i32> %x, ptr %p) {
 define i32 @shl_known_nonzero_nsw(i32 %x, i32 %yy) {
 ; X86-LABEL: shl_known_nonzero_nsw:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -270,7 +270,6 @@ define i32 @shl_known_nonzero_nsw(i32 %x, i32 %yy) {
 define i32 @shl_known_nonzero_nsw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-LABEL: shl_known_nonzero_nsw_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pslld $23, %xmm0
@@ -282,6 +281,7 @@ define i32 @shl_known_nonzero_nsw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-NEXT:    pmuludq %xmm2, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -309,9 +309,9 @@ define i32 @shl_known_nonzero_nsw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 define i32 @shl_known_nonzero_nuw(i32 %x, i32 %yy) {
 ; X86-LABEL: shl_known_nonzero_nuw:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -333,7 +333,6 @@ define i32 @shl_known_nonzero_nuw(i32 %x, i32 %yy) {
 define i32 @shl_known_nonzero_nuw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-LABEL: shl_known_nonzero_nuw_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pslld $23, %xmm0
@@ -345,6 +344,7 @@ define i32 @shl_known_nonzero_nuw_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86-NEXT:    pmuludq %xmm2, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -375,9 +375,9 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: shl_maybe_zero:
@@ -396,10 +396,10 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
 define i32 @uaddsat_known_nonzero(i32 %x) {
 ; X86-LABEL: uaddsat_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    incl %eax
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    incl %ecx
+; X86-NEXT:    cmovel %eax, %ecx
 ; X86-NEXT:    rep bsfl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -418,8 +418,8 @@ define i32 @uaddsat_known_nonzero(i32 %x) {
 define i32 @uaddsat_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-LABEL: uaddsat_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    paddusb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -443,13 +443,13 @@ define i32 @uaddsat_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: uaddsat_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovael %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uaddsat_maybe_zero:
@@ -468,13 +468,13 @@ define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
 define i32 @umax_known_nonzero(i32 %x, i32 %y) {
 ; X86-LABEL: umax_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $4, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmoval %eax, %edx
-; X86-NEXT:    rep bsfl %edx, %eax
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umax_known_nonzero:
@@ -496,7 +496,6 @@ define i32 @umax_known_nonzero(i32 %x, i32 %y) {
 define i32 @umax_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-LABEL: umax_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, %xmm3
 ; X86-NEXT:    psllw $5, %xmm3
 ; X86-NEXT:    pxor %xmm2, %xmm2
@@ -523,6 +522,7 @@ define i32 @umax_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-NEXT:    pand %xmm2, %xmm1
 ; X86-NEXT:    por %xmm3, %xmm1
 ; X86-NEXT:    pmaxub %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -561,9 +561,9 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmoval %ecx, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umax_maybe_zero:
@@ -581,14 +581,14 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
 define i32 @umin_known_nonzero(i32 %xx, i32 %yy) {
 ; X86-LABEL: umin_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $4, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    addl $4, %eax
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    cmovbl %edx, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl $4, %ecx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    rep bsfl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umin_known_nonzero:
@@ -612,7 +612,6 @@ define i32 @umin_known_nonzero(i32 %xx, i32 %yy) {
 define i32 @umin_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-LABEL: umin_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, %xmm3
 ; X86-NEXT:    psllw $5, %xmm3
 ; X86-NEXT:    pxor %xmm2, %xmm2
@@ -639,11 +638,12 @@ define i32 @umin_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 ; X86-NEXT:    pand %xmm2, %xmm1
 ; X86-NEXT:    por %xmm3, %xmm1
 ; X86-NEXT:    pminub %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umin_known_nonzero_vec:
@@ -676,13 +676,13 @@ define i32 @umin_known_nonzero_vec(<16 x i8> %x, ptr %p) {
 define i32 @umin_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: umin_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $54, %eax
-; X86-NEXT:    movl $54, %ecx
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $54, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $54, %ecx
+; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: umin_maybe_zero:
@@ -701,14 +701,14 @@ define i32 @umin_maybe_zero(i32 %x, i32 %y) {
 define i32 @smin_known_nonzero(i32 %xx, i32 %yy) {
 ; X86-LABEL: smin_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $4, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    addl $4, %eax
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    cmovll %edx, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl $4, %ecx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    cmovll %eax, %ecx
+; X86-NEXT:    rep bsfl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smin_known_nonzero:
@@ -732,11 +732,11 @@ define i32 @smin_known_nonzero(i32 %xx, i32 %yy) {
 define i32 @smin_known_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smin_known_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $-54, %eax
-; X86-NEXT:    movl $-54, %ecx
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    movl $-54, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $-54, %ecx
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smin_known_zero:
@@ -788,13 +788,13 @@ define <4 x i32> @smin_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
 define i32 @smin_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smin_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $54, %eax
-; X86-NEXT:    movl $54, %ecx
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $54, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $54, %ecx
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smin_maybe_zero:
@@ -839,14 +839,14 @@ define i32 @smin_known_never_zero_vec_element(<4 x i32> %x) {
 define i32 @smax_known_nonzero(i32 %xx, i32 %yy) {
 ; X86-LABEL: smax_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $4, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    addl $4, %eax
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    cmovgl %edx, %eax
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl $4, %ecx
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    cmovgl %eax, %ecx
+; X86-NEXT:    rep bsfl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smax_known_nonzero:
@@ -870,11 +870,11 @@ define i32 @smax_known_nonzero(i32 %xx, i32 %yy) {
 define i32 @smax_maybe_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smax_maybe_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $55, %eax
-; X86-NEXT:    movl $54, %ecx
-; X86-NEXT:    cmovgel %eax, %ecx
-; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    movl $54, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $55, %ecx
+; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smax_maybe_zero:
@@ -926,13 +926,13 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) {
 define i32 @smax_known_zero(i32 %x, i32 %y) {
 ; X86-LABEL: smax_known_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovnsl %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    cmovnsl %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: smax_known_zero:
@@ -976,9 +976,9 @@ define i32 @smax_known_never_zero_vec_element(<4 x i32> %x) {
 define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
 ; X86-LABEL: rotr_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    rorl %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1006,9 +1006,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rorl %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotr_maybe_zero:
@@ -1030,9 +1030,9 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
 define i32 @rotr_with_fshr_known_nonzero(i32 %xx, i32 %y) {
 ; X86-LABEL: rotr_with_fshr_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    rorl %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1054,15 +1054,14 @@ define i32 @rotr_with_fshr_known_nonzero(i32 %xx, i32 %y) {
 define i32 @rotr_with_fshr_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) {
 ; X86-LABEL: rotr_with_fshr_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pxor %xmm2, %xmm2
+; X86-NEXT:    psubd %xmm1, %xmm2
+; X86-NEXT:    pslld $23, %xmm2
+; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT:    cvttps2dq %xmm2, %xmm1
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    pxor %xmm3, %xmm3
-; X86-NEXT:    psubd %xmm1, %xmm3
-; X86-NEXT:    pslld $23, %xmm3
-; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-NEXT:    cvttps2dq %xmm3, %xmm1
 ; X86-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -1073,6 +1072,7 @@ define i32 @rotr_with_fshr_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-NEXT:    por %xmm3, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -1114,9 +1114,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    rorl %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotr_with_fshr_maybe_zero:
@@ -1135,9 +1135,9 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
 define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
 ; X86-LABEL: rotl_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1165,9 +1165,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotl_maybe_zero:
@@ -1189,9 +1189,9 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
 define i32 @rotl_with_fshl_known_nonzero(i32 %xx, i32 %y) {
 ; X86-LABEL: rotl_with_fshl_known_nonzero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1213,7 +1213,6 @@ define i32 @rotl_with_fshl_known_nonzero(i32 %xx, i32 %y) {
 define i32 @rotl_with_fshl_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) {
 ; X86-LABEL: rotl_with_fshl_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pslld $23, %xmm1
@@ -1230,6 +1229,7 @@ define i32 @rotl_with_fshl_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-NEXT:    por %xmm3, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -1269,9 +1269,9 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: rotl_with_fshl_maybe_zero:
@@ -1314,9 +1314,9 @@ define i32 @sra_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorps %xmm1, %xmm1
 ; X86-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa {{.*#+}} xmm0 = [2147606891,65535,1,0]
 ; X86-NEXT:    psrad %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -1342,9 +1342,9 @@ define i32 @sra_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 define i32 @sra_known_nonzero_exact(i32 %x, i32 %yy) {
 ; X86-LABEL: sra_known_nonzero_exact:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1368,9 +1368,9 @@ define i32 @sra_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorps %xmm2, %xmm2
 ; X86-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    psrad %xmm2, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
@@ -1401,9 +1401,9 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sra_maybe_zero:
@@ -1446,9 +1446,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorps %xmm1, %xmm1
 ; X86-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa {{.*#+}} xmm0 = [0,65535,2147606891,0]
 ; X86-NEXT:    psrld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm0, %eax
@@ -1475,9 +1475,9 @@ define i32 @srl_known_nonzero_sign_bit_set_vec(<4 x i32> %x, ptr %p) {
 define i32 @srl_known_nonzero_exact(i32 %x, i32 %yy) {
 ; X86-LABEL: srl_known_nonzero_exact:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
@@ -1501,9 +1501,9 @@ define i32 @srl_known_nonzero_exact_vec(<4 x i32> %x, <4 x i32> %yy, ptr %p) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorps %xmm2, %xmm2
 ; X86-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    psrld %xmm2, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
 ; X86-NEXT:    movd %xmm0, %eax
@@ -1534,9 +1534,9 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: srl_maybe_zero:
@@ -1579,10 +1579,7 @@ define i32 @udiv_known_nonzero(i32 %xx, i32 %y) {
 define i32 @udiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind {
 ; X86-LABEL: udiv_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
 ; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    movl $-1, %eax
@@ -1590,6 +1587,7 @@ define i32 @udiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    movd %xmm1, %ecx
+; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
@@ -1597,27 +1595,27 @@ define i32 @udiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm3, %edi
+; X86-NEXT:    movd %xmm3, %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
 ; X86-NEXT:    movd %xmm3, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
+; X86-NEXT:    divl %esi
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm1, %edi
+; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %edi
+; X86-NEXT:    divl %esi
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm2, (%esi)
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm2, (%eax)
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: udiv_known_nonzero_vec:
@@ -1663,9 +1661,9 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl {{[0-9]+}}(%esp)
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: udiv_maybe_zero:
@@ -1709,10 +1707,7 @@ define i32 @sdiv_known_nonzero(i32 %xx, i32 %y) {
 define i32 @sdiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind {
 ; X86-LABEL: sdiv_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
 ; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    movl $-1, %eax
@@ -1720,6 +1715,7 @@ define i32 @sdiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind
 ; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    movd %xmm1, %ecx
+; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %ecx
@@ -1727,27 +1723,27 @@ define i32 @sdiv_known_nonzero_vec(<4 x i32> %xx, <4 x i32> %y, ptr %p) nounwind
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; X86-NEXT:    movd %xmm3, %edi
+; X86-NEXT:    movd %xmm3, %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
 ; X86-NEXT:    movd %xmm3, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
+; X86-NEXT:    idivl %esi
 ; X86-NEXT:    movd %eax, %xmm3
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X86-NEXT:    movd %xmm1, %edi
+; X86-NEXT:    movd %xmm1, %esi
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %edi
+; X86-NEXT:    idivl %esi
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X86-NEXT:    movdqa %xmm2, (%esi)
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm2, (%eax)
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sdiv_known_nonzero_vec:
@@ -1793,9 +1789,9 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl {{[0-9]+}}(%esp)
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sdiv_maybe_zero:
@@ -1839,9 +1835,9 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl $1, %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: add_maybe_zero:
@@ -1860,8 +1856,8 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
 define i32 @add_nuw_known_nonzero_vec(<4 x i32> %xx, ptr %p) {
 ; X86-LABEL: add_nuw_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -1939,9 +1935,9 @@ define i32 @sub_maybe_zero(i32 %x) {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    orl $64, %ecx
 ; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_maybe_zero:
@@ -1963,9 +1959,9 @@ define i32 @sub_maybe_zero2(i32 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_maybe_zero2:
@@ -1982,10 +1978,10 @@ define i32 @sub_maybe_zero2(i32 %x) {
 define i32 @sub_known_nonzero_ne_vec(<4 x i32> %xx, ptr %p) {
 ; X86-LABEL: sub_known_nonzero_ne_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movd {{.*#+}} xmm1 = [2,0,0,0]
 ; X86-NEXT:    psubd %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -2014,9 +2010,9 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_known_nonzero_nsw:
@@ -2038,9 +2034,9 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_known_nonzero_nuw:
@@ -2061,9 +2057,9 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_maybe_zero:
@@ -2088,9 +2084,9 @@ define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
 ; X86-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X86-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,256,u,u,u,u,u,u]
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitcast_known_nonzero:
@@ -2115,9 +2111,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
 ; X86-LABEL: bitcast_maybe_zero:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitcast_maybe_zero:
@@ -2134,9 +2130,9 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
 define i32 @bitcast_from_float(float %x) {
 ; X86-LABEL: bitcast_from_float:
 ; X86:       # %bb.0:
-; X86-NEXT:    bsfl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitcast_from_float:
@@ -2178,12 +2174,12 @@ define i32 @abs_known_nonzero(i32 %xx) {
 define i32 @abs_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-LABEL: abs_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrad $31, %xmm1
 ; X86-NEXT:    pxor %xmm1, %xmm0
 ; X86-NEXT:    psubd %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -2212,9 +2208,9 @@ define i32 @abs_maybe_zero(i32 %x) {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    cmovsl %eax, %ecx
-; X86-NEXT:    bsfl %ecx, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %edx
+; X86-NEXT:    bsfl %ecx, %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: abs_maybe_zero:
@@ -2254,7 +2250,6 @@ define i32 @bswap_known_nonzero(i32 %xx) {
 define i32 @bswap_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-LABEL: bswap_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    movdqa %xmm0, %xmm2
@@ -2265,6 +2260,7 @@ define i32 @bswap_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
 ; X86-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; X86-NEXT:    packuswb %xmm2, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -2291,9 +2287,9 @@ define i32 @bswap_maybe_zero(i32 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bswap_maybe_zero:
@@ -2364,7 +2360,6 @@ define i32 @bitreverse_known_nonzero(i32 %xx) {
 define i32 @bitreverse_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-LABEL: bitreverse_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    movdqa %xmm0, %xmm2
@@ -2396,6 +2391,7 @@ define i32 @bitreverse_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-NEXT:    pand %xmm2, %xmm0
 ; X86-NEXT:    paddb %xmm0, %xmm0
 ; X86-NEXT:    por %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -2447,9 +2443,9 @@ define i32 @bitreverse_maybe_zero(i32 %x) {
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bitreverse_maybe_zero:
@@ -2532,7 +2528,6 @@ define i32 @ctpop_known_nonzero(i32 %xx) {
 define i32 @ctpop_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-LABEL: ctpop_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrlw $1, %xmm1
@@ -2555,6 +2550,7 @@ define i32 @ctpop_known_nonzero_vec(<4 x i32> %xx, ptr %p) nounwind {
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-NEXT:    psadbw %xmm0, %xmm1
 ; X86-NEXT:    packuswb %xmm2, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm1, (%eax)
 ; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
@@ -2608,9 +2604,9 @@ define i32 @ctpop_maybe_zero(i32 %x) {
 ; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
 ; X86-NEXT:    imull $16843009, %ecx, %eax # imm = 0x1010101
 ; X86-NEXT:    shrl $24, %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ctpop_maybe_zero:
@@ -2664,10 +2660,10 @@ define i32 @zext_known_nonzero(i16 %xx) {
 define i32 @zext_maybe_zero(i16 %x) {
 ; X86-LABEL: zext_maybe_zero:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $32, %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: zext_maybe_zero:
@@ -2707,8 +2703,9 @@ define i32 @sext_known_nonzero(i16 %xx) {
 define i32 @sext_known_nonzero_vec(<8 x i16> %xx, ptr %p) {
 ; X86-LABEL: sext_known_nonzero_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pxor %xmm1, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm1, 16(%eax)
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-NEXT:    pslld $23, %xmm0
 ; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
@@ -2718,7 +2715,6 @@ define i32 @sext_known_nonzero_vec(<8 x i16> %xx, ptr %p) {
 ; X86-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,2,4,5,6,7]
 ; X86-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
 ; X86-NEXT:    psrad $16, %xmm0
-; X86-NEXT:    movdqa %xmm1, 16(%eax)
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
@@ -2753,10 +2749,10 @@ define i32 @sext_known_nonzero_vec(<8 x i16> %xx, ptr %p) {
 define i32 @sext_maybe_zero(i16 %x) {
 ; X86-LABEL: sext_maybe_zero:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $32, %ecx
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sext_maybe_zero:
@@ -2773,7 +2769,6 @@ define i32 @sext_maybe_zero(i16 %x) {
 define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X86-LABEL: test_zext_demanded_elts:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
 ; X86-NEXT:    pcmpgtd %xmm0, %xmm2
@@ -2782,11 +2777,12 @@ define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-NEXT:    por %xmm0, %xmm2
 ; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm0, 16(%eax)
 ; X86-NEXT:    movd %xmm2, %ecx
-; X86-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-NEXT:    movdqa %xmm2, 16(%eax)
-; X86-NEXT:    movdqa %xmm0, (%eax)
+; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT:    movdqa %xmm2, (%eax)
 ; X86-NEXT:    bsfl %ecx, %ecx
 ; X86-NEXT:    movl $64, %eax
 ; X86-NEXT:    cmovnel %ecx, %eax
@@ -2820,7 +2816,6 @@ define i32 @test_zext_demanded_elts(<4 x i32> %a0, ptr %p) {
 define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X86-LABEL: test_sext_demanded_elts:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pxor %xmm1, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
 ; X86-NEXT:    pcmpgtd %xmm0, %xmm2
@@ -2828,14 +2823,15 @@ define i32 @test_sext_demanded_elts(<4 x i32> %a0, ptr %p) {
 ; X86-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-NEXT:    por %xmm0, %xmm2
-; X86-NEXT:    pcmpgtd %xmm2, %xmm1
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqa %xmm0, 16(%eax)
+; X86-NEXT:    pcmpgtd %xmm2, %xmm1
 ; X86-NEXT:    movd %xmm2, %ecx
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT:    movdqa %xmm0, 16(%eax)
 ; X86-NEXT:    movdqa %xmm2, (%eax)
-; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %ecx, %edx
+; X86-NEXT:    movd %xmm1, %eax
 ; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    addl $32, %eax
 ; X86-NEXT:    testl %ecx, %ecx
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 5368934fa5bf1..82ac7cb850921 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -26,9 +26,9 @@ define <2 x double> @signbits_sext_v2i64_sitofp_v2f64(i32 %a0, i32 %a1) nounwind
 define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext %a1, i32 %a2, i32 %a3) nounwind {
 ; X86-LABEL: signbits_sext_v4i64_sitofp_v4f32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovd %ecx, %xmm0
 ; X86-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -264,9 +264,9 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
 define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
 ; X86-LABEL: signbits_sext_shuffle_sitofp:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X86-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -418,24 +418,24 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovapd 8(%ebp), %xmm3
-; X86-NEXT:    vpsrad $31, %xmm2, %xmm4
-; X86-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X86-NEXT:    vpsrad $1, %xmm5, %xmm5
-; X86-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; X86-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; X86-NEXT:    vpsrad $31, %xmm2, %xmm5
+; X86-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-NEXT:    vpcmpeqq %xmm3, %xmm4, %xmm3
+; X86-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; X86-NEXT:    vpsrad $31, %xmm4, %xmm5
+; X86-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-NEXT:    vpsrad $1, %xmm4, %xmm4
+; X86-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; X86-NEXT:    vmovapd 8(%ebp), %xmm5
+; X86-NEXT:    vshufps {{.*#+}} xmm6 = xmm5[2,2,3,3]
+; X86-NEXT:    vblendvpd %xmm3, %xmm4, %xmm6, %xmm3
+; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vpsrad $31, %xmm2, %xmm1
 ; X86-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; X86-NEXT:    vpsrad $1, %xmm2, %xmm2
-; X86-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; X86-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
-; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm6
-; X86-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vblendvpd %xmm0, %xmm2, %xmm5, %xmm0
-; X86-NEXT:    vblendvpd %xmm6, %xmm4, %xmm3, %xmm1
-; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm5, %xmm0
+; X86-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; X86-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -712,12 +712,12 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8 immarg
 define void @cross_bb_signbits_insert_subvec(ptr %ptr, <32 x i8> %x, <32 x i8> %z) {
 ; X86-LABEL: cross_bb_signbits_insert_subvec:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; X86-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; X86-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
 ; X86-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm0
 ; X86-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vandnps %ymm1, %ymm0, %ymm1
 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll
index 7bef94cca0d35..722a7d3d1fb9e 100644
--- a/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll
+++ b/llvm/test/CodeGen/X86/lack-of-signed-truncation-check.ll
@@ -108,11 +108,10 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movswl %ax, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
@@ -133,11 +132,10 @@ define i1 @shifts_necmp_i64_i8(i64 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsbl %al, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/ldexp-libcall.ll b/llvm/test/CodeGen/X86/ldexp-libcall.ll
index 74256c801d029..54f01976f35bc 100644
--- a/llvm/test/CodeGen/X86/ldexp-libcall.ll
+++ b/llvm/test/CodeGen/X86/ldexp-libcall.ll
@@ -10,9 +10,9 @@ define float @call_ldexpf(float %a, i32 %b) nounwind {
 ; CHECK-WIN-LABEL: call_ldexpf:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $8, %esp
-; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexpf
 ; CHECK-WIN-NEXT:    addl $8, %esp
@@ -29,9 +29,9 @@ define double @call_ldexp(double %a, i32 %b) nounwind {
 ; CHECK-WIN-LABEL: call_ldexp:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $12, %esp
-; CHECK-WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstpl (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexp
 ; CHECK-WIN-NEXT:    addl $12, %esp
@@ -56,9 +56,9 @@ define x86_fp80 @call_ldexpl(x86_fp80 %a, i32 %b) nounwind {
 ; CHECK-WIN-NEXT:    movl %esp, %ebp
 ; CHECK-WIN-NEXT:    andl $-16, %esp
 ; CHECK-WIN-NEXT:    subl $48, %esp
-; CHECK-WIN-NEXT:    fldt 8(%ebp)
 ; CHECK-WIN-NEXT:    movl 24(%ebp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    fldt 8(%ebp)
 ; CHECK-WIN-NEXT:    fstpt (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexpl
 ; CHECK-WIN-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/ldexp-not-readonly.ll b/llvm/test/CodeGen/X86/ldexp-not-readonly.ll
index 9a67cf8b31714..bfecb358b8ff7 100644
--- a/llvm/test/CodeGen/X86/ldexp-not-readonly.ll
+++ b/llvm/test/CodeGen/X86/ldexp-not-readonly.ll
@@ -13,9 +13,9 @@ define float @call_ldexpf(float %a, i32 %b) nounwind {
 ; CHECK-WIN-LABEL: call_ldexpf:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $8, %esp
-; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexpf
 ; CHECK-WIN-NEXT:    addl $8, %esp
@@ -35,9 +35,9 @@ define double @call_ldexp(double %a, i32 %b) nounwind {
 ; CHECK-WIN-LABEL: call_ldexp:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $12, %esp
-; CHECK-WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstpl (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexp
 ; CHECK-WIN-NEXT:    addl $12, %esp
diff --git a/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll b/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll
index b4ba53f80ad5b..6db351b74848e 100644
--- a/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll
+++ b/llvm/test/CodeGen/X86/ldexp-wrong-signature.ll
@@ -13,11 +13,11 @@ define float @ldexpf_too_many_args(float %a, i32 %b, i32 %c) nounwind {
 ; CHECK-WIN-LABEL: ldexpf_too_many_args:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $12, %esp
-; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-WIN-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexpf
 ; CHECK-WIN-NEXT:    addl $12, %esp
@@ -37,9 +37,9 @@ define float @ldexp_wrong_fp_type(float %a, i32 %b) nounwind {
 ; CHECK-WIN-LABEL: ldexp_wrong_fp_type:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $8, %esp
-; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-WIN-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexp
 ; CHECK-WIN-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll b/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll
index d48c1c2e0a9a7..cdaaa6256a31c 100644
--- a/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll
+++ b/llvm/test/CodeGen/X86/ldexp-wrong-signature2.ll
@@ -12,8 +12,10 @@ define i32 @ldexpf_not_fp(i32 %a, i32 %b) nounwind {
 ;
 ; CHECK-WIN-LABEL: ldexpf_not_fp:
 ; CHECK-WIN:       # %bb.0:
-; CHECK-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-WIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-WIN-NEXT:    pushl %eax
+; CHECK-WIN-NEXT:    pushl %ecx
 ; CHECK-WIN-NEXT:    calll _ldexpf
 ; CHECK-WIN-NEXT:    addl $8, %esp
 ; CHECK-WIN-NEXT:    retl
@@ -33,8 +35,8 @@ define float @ldexp_not_int(float %a, float %b) nounwind {
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    subl $8, %esp
 ; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-WIN-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-WIN-NEXT:    fstps (%esp)
 ; CHECK-WIN-NEXT:    calll _ldexp
 ; CHECK-WIN-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 59ec7bfcaa910..ad2b24350e921 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -133,15 +133,14 @@ define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) nounwind {
 ;
 ; WIN32-LABEL: ldexp_v2f32:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $20, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
-; WIN32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -150,7 +149,6 @@ define <2 x float> @ldexp_v2f32(<2 x float> %val, <2 x i32> %exp) nounwind {
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl $20, %esp
-; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %val, <2 x i32> %exp)
   ret <2 x float> %1
@@ -244,54 +242,48 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) nounwind {
 ;
 ; WIN32-LABEL: ldexp_v4f32:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %ebp
-; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $44, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
-; WIN32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 12(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 8(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstps 12(%esi)
-; WIN32-NEXT:    fstps 8(%esi)
-; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    fstps (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    addl $44, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
-; WIN32-NEXT:    popl %ebx
-; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
   %1 = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %val, <4 x i32> %exp)
   ret <4 x float> %1
@@ -345,25 +337,21 @@ define <2 x double> @ldexp_v2f64(<2 x double> %val, <2 x i32> %exp) nounwind {
 ;
 ; WIN32-LABEL: ldexp_v2f64:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $28, %esp
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
+; WIN32-NEXT:    subl $20, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fxch %st(1)
-; WIN32-NEXT:    addl $28, %esp
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    addl $20, %esp
 ; WIN32-NEXT:    retl
   %1 = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %val, <2 x i32> %exp)
   ret <2 x double> %1
@@ -456,54 +444,40 @@ define <4 x double> @ldexp_v4f64(<4 x double> %val, <4 x i32> %exp) nounwind {
 ;
 ; WIN32-LABEL: ldexp_v4f64:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %ebp
-; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $44, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
+; WIN32-NEXT:    subl $28, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
-; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
-; WIN32-NEXT:    calll _ldexp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    fstpl 24(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstpl 16(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstpl 8(%esi)
-; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstpl (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    addl $44, %esp
+; WIN32-NEXT:    addl $28, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
-; WIN32-NEXT:    popl %ebx
-; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
   %1 = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %val, <4 x i32> %exp)
   ret <4 x double> %1
@@ -537,13 +511,12 @@ define half @ldexp_f16(half %arg0, i32 %arg1) nounwind {
 ;
 ; WIN32-LABEL: ldexp_f16:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, (%esp)
 ; WIN32-NEXT:    calll ___extendhfsf2
-; WIN32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _ldexp
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -551,7 +524,6 @@ define half @ldexp_f16(half %arg0, i32 %arg1) nounwind {
 ; WIN32-NEXT:    fstps (%esp)
 ; WIN32-NEXT:    calll ___truncsfhf2
 ; WIN32-NEXT:    addl $16, %esp
-; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    retl
   %ldexp = call half @llvm.ldexp.f16.i32(half %arg0, i32 %arg1)
   ret half %ldexp
diff --git a/llvm/test/CodeGen/X86/lea-2.ll b/llvm/test/CodeGen/X86/lea-2.ll
index 0883a8e726e25..3329e95d4c1ea 100644
--- a/llvm/test/CodeGen/X86/lea-2.ll
+++ b/llvm/test/CodeGen/X86/lea-2.ll
@@ -30,10 +30,10 @@ define i32 @test1(i32 %A, i32 %B) {
 define i64 @test2(i32 %a0, i64 %a1) {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    andl $2147483640, %eax # imm = 0x7FFFFFF8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    andl $2147483640, %eax # imm = 0x7FFFFFF8
 ; X86-NEXT:    leal 4(%eax,%eax), %eax
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
diff --git a/llvm/test/CodeGen/X86/lea-opt-cse1.ll b/llvm/test/CodeGen/X86/lea-opt-cse1.ll
index 5ceca9fbd9b5f..f7cae2ac4d88e 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse1.ll
@@ -25,8 +25,8 @@ define void @test_func(ptr nocapture %ctx, i32 %n) local_unnamed_addr {
 ; X86-NEXT:    movl (%eax), %ecx
 ; X86-NEXT:    movl 16(%eax), %edx
 ; X86-NEXT:    leal 1(%ecx,%edx), %esi
-; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    leal 1(%edx,%ecx), %ecx
 ; X86-NEXT:    movl %ecx, 16(%eax)
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/lea-opt-cse3.ll b/llvm/test/CodeGen/X86/lea-opt-cse3.ll
index 93e4fa77b5629..6f7c276a45707 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse3.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse3.ll
@@ -16,8 +16,8 @@ define i32 @foo(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal 4(%ecx,%eax,2), %edx
-; X86-NEXT:    leal 4(%ecx,%eax,4), %eax
+; X86-NEXT:    leal 4(%ecx,%eax,4), %edx
+; X86-NEXT:    leal 4(%ecx,%eax,2), %eax
 ; X86-NEXT:    imull %edx, %eax
 ; X86-NEXT:    retl
 entry:
@@ -44,8 +44,8 @@ define i32 @foo1(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal 4(%ecx,%eax,4), %edx
-; X86-NEXT:    leal 4(%ecx,%eax,8), %eax
+; X86-NEXT:    leal 4(%ecx,%eax,8), %edx
+; X86-NEXT:    leal 4(%ecx,%eax,4), %eax
 ; X86-NEXT:    imull %edx, %eax
 ; X86-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
index 4fa9acd99bb2f..c62a7d2432f8c 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
@@ -31,8 +31,8 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    leal 1(%ecx,%edx), %esi
-; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    leal 1(%ecx,%edx), %ecx
 ; X86-NEXT:    movl %ecx, 16(%eax)
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index 96780af0d0fd1..01089db06ea4c 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
 
 ; PR26575
@@ -13,19 +14,28 @@ declare <4 x float> @_mm_castsi128_ps(<2 x i64>) optsize
 
 ; Check that the LEA optimization pass works with CPI address displacements.
 define void @test1(ptr nocapture readonly %src, i32 %len) #0 {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %esp, %edx
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    calll _memcpy
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152]
+; CHECK-NEXT:    calll __mm_xor_si128
+; CHECK-NEXT:    pclmulqdq $16, __xmm@0000000154442bd400000001c6e41596, %xmm0
+; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    jmp __mm_castsi128_ps # TAILCALL
   %parts = alloca [4 x i32], align 4
   call void @llvm.memcpy.p0.p0.i32(ptr %parts, ptr %src, i32 %len, i1 false)
   %call0 = tail call <2 x i64> @_mm_xor_si128(<2 x i64> undef, <2 x i64> <i64 -9187201950435737472, i64 -9187201950435737472>)
   %tmp0 = tail call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> undef, <2 x i64> <i64 7631803798, i64 5708721108>, i8 16)
   %call1 = tail call <4 x float> @_mm_castsi128_ps(<2 x i64> %tmp0)
   ret void
-; CHECK-LABEL: test1:
-; CHECK:	movl %esp,
-; CHECK:	calll _memcpy
-; CHECK:	movaps __xmm@{{[0-9a-f]+}}, %xmm1
-; CHECK:	calll __mm_xor_si128
-; CHECK:	pclmulqdq $16, __xmm@{{[0-9a-f]+}}, %xmm0
-; CHECK:	jmp __mm_castsi128_ps
 }
 
 declare i32 @GetLastError(...)
@@ -38,6 +48,16 @@ declare ptr @llvm.localrecover(ptr, ptr, i32)
 
 ; Check that the MCSymbol objects are created to be used in "\01?fin$0@0@test2@@".
 define void @test2() #0 {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:  Ltest2$frame_escape_0 = 8
+; CHECK-NEXT:  Ltest2$frame_escape_1 = 4
+; CHECK-NEXT:  Ltest2$frame_escape_2 = 0
+; CHECK-NEXT:    movl %esp, %ecx
+; CHECK-NEXT:    calll "?fin$0@0@test2@@"
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
 entry:
   %fActivateActCtxSuccess = alloca i32, align 4
   %proc = alloca i32, align 4
@@ -46,11 +66,6 @@ entry:
   %tmp0 = tail call ptr @llvm.localaddress()
   call fastcc void @"\01?fin$0@0@test2@@"(ptr %tmp0)
   ret void
-; CHECK-LABEL: test2:
-; CHECK:	Ltest2$frame_escape_0 = 8
-; CHECK:	Ltest2$frame_escape_1 = 4
-; CHECK:	Ltest2$frame_escape_2 = 0
-; CHECK:	calll "?fin$0@0@test2@@"
 }
 
 ; Check that the LEA optimization pass works with MCSymbol address displacements.
diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll
index 53208de7ea27e..ece331139d62e 100644
--- a/llvm/test/CodeGen/X86/legalize-shift-64.ll
+++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll
@@ -4,9 +4,9 @@
 define i64 @test1(i32 %xx, i32 %test) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    andb $7, %cl
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    shrl %edx
@@ -24,10 +24,10 @@ define i64 @test2(i64 %xx, i32 %test) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    andb $7, %cl
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    shldl %cl, %esi, %edx
@@ -42,10 +42,10 @@ define i64 @test2(i64 %xx, i32 %test) nounwind {
 define i64 @test3(i64 %xx, i32 %test) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    andb $7, %cl
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    shrdl %cl, %edx, %eax
 ; CHECK-NEXT:    shrl %cl, %edx
 ; CHECK-NEXT:    retl
@@ -58,10 +58,10 @@ define i64 @test3(i64 %xx, i32 %test) nounwind {
 define i64 @test4(i64 %xx, i32 %test) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    andb $7, %cl
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    shrdl %cl, %edx, %eax
 ; CHECK-NEXT:    sarl %cl, %edx
 ; CHECK-NEXT:    retl
@@ -75,55 +75,59 @@ define i64 @test4(i64 %xx, i32 %test) nounwind {
 define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 20
-; CHECK-NEXT:    .cfi_offset %esi, -20
-; CHECK-NEXT:    .cfi_offset %edi, -16
-; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    .cfi_offset %ebp, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %esi, -16
+; CHECK-NEXT:    .cfi_offset %edi, -12
+; CHECK-NEXT:    .cfi_offset %ebx, -8
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %ebx, %edi
-; CHECK-NEXT:    shll %cl, %edi
-; CHECK-NEXT:    shldl %cl, %ebx, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    shll %cl, %esi
+; CHECK-NEXT:    shldl %cl, %edx, %eax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    je .LBB4_2
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    jne .LBB4_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    movl %edi, %esi
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:  .LBB4_2:
-; CHECK-NEXT:    movl %edx, %ebx
-; CHECK-NEXT:    movb %ch, %cl
-; CHECK-NEXT:    shll %cl, %ebx
-; CHECK-NEXT:    shldl %cl, %edx, %ebp
-; CHECK-NEXT:    testb $32, %ch
-; CHECK-NEXT:    je .LBB4_4
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl $0, %ecx
+; CHECK-NEXT:    jne .LBB4_4
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    movl %ebx, %ebp
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:  .LBB4_4:
-; CHECK-NEXT:    movl %ebp, 12(%eax)
-; CHECK-NEXT:    movl %ebx, 8(%eax)
-; CHECK-NEXT:    movl %esi, 4(%eax)
-; CHECK-NEXT:    movl %edi, (%eax)
+; CHECK-NEXT:    movl %ecx, 8(%eax)
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl %ebx, %esi
+; CHECK-NEXT:    shll %cl, %esi
+; CHECK-NEXT:    shldl %cl, %ebx, %edi
+; CHECK-NEXT:    testb $32, %cl
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    jne .LBB4_6
+; CHECK-NEXT:  # %bb.5:
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:  .LBB4_6:
+; CHECK-NEXT:    movl %ecx, 4(%eax)
+; CHECK-NEXT:    jne .LBB4_8
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:  .LBB4_8:
+; CHECK-NEXT:    movl %edx, (%eax)
 ; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl $4
   %shl = shl <2 x i64> %A, %B
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index 4cfb050958abb..ba56ef6fc943a 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -5,31 +5,32 @@
 define <2 x i256> @test_shl(<2 x i256> %In) nounwind {
 ; X86-LABEL: test_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shldl $2, %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %edx, 60(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $2, %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $2, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    shldl $2, %edx, %esi
+; X86-NEXT:    movl %esi, 52(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $2, %ecx, %edx
-; X86-NEXT:    movl %edx, 52(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $2, %edx, %ecx
-; X86-NEXT:    movl %ecx, 48(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $2, %ecx, %edx
-; X86-NEXT:    movl %edx, 44(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $2, %esi, %edx
+; X86-NEXT:    movl %edx, 48(%eax)
+; X86-NEXT:    shldl $2, %ecx, %esi
+; X86-NEXT:    movl %esi, 44(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $2, %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $2, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $2, %ecx, %edx
-; X86-NEXT:    movl %edx, 36(%eax)
-; X86-NEXT:    shll $2, %ecx
-; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    shldl $2, %edx, %esi
+; X86-NEXT:    movl %esi, 36(%eax)
+; X86-NEXT:    shll $2, %edx
+; X86-NEXT:    movl %edx, 32(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    movl %ecx, 28(%eax)
@@ -40,6 +41,7 @@ define <2 x i256> @test_shl(<2 x i256> %In) nounwind {
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
 ; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_shl:
@@ -70,43 +72,33 @@ define <2 x i256> @test_shl(<2 x i256> %In) nounwind {
 define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 ; X86-LABEL: test_srl:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrl $4, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shldl $28, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $4, %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $28, %eax, %ebp
-; X86-NEXT:    shrdl $4, %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shldl $28, %eax, %esi
-; X86-NEXT:    shrdl $4, %eax, %edx
+; X86-NEXT:    movl %esi, 60(%eax)
+; X86-NEXT:    shldl $28, %ecx, %edx
+; X86-NEXT:    movl %edx, 56(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $28, %esi, %ecx
+; X86-NEXT:    movl %ecx, 52(%eax)
+; X86-NEXT:    shldl $28, %edx, %esi
+; X86-NEXT:    movl %esi, 48(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrdl $4, %ecx, %ebx
-; X86-NEXT:    shrl $4, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %ebx, 56(%eax)
-; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %edx, 48(%eax)
-; X86-NEXT:    movl %ebp, 44(%eax)
-; X86-NEXT:    movl %edi, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $28, %esi, %edx
+; X86-NEXT:    movl %edx, 44(%eax)
+; X86-NEXT:    shldl $28, %ecx, %esi
+; X86-NEXT:    movl %esi, 40(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $28, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    shrdl $4, %esi, %edx
+; X86-NEXT:    movl %edx, 32(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrl $31, %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
@@ -117,11 +109,7 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_srl:
@@ -152,43 +140,33 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
 ; X86-LABEL: test_sra:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sarl $6, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shldl $26, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $6, %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $26, %eax, %ebp
-; X86-NEXT:    shrdl $6, %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shldl $26, %eax, %esi
-; X86-NEXT:    shrdl $6, %eax, %edx
+; X86-NEXT:    movl %esi, 60(%eax)
+; X86-NEXT:    shldl $26, %ecx, %edx
+; X86-NEXT:    movl %edx, 56(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $26, %esi, %ecx
+; X86-NEXT:    movl %ecx, 52(%eax)
+; X86-NEXT:    shldl $26, %edx, %esi
+; X86-NEXT:    movl %esi, 48(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrdl $6, %ecx, %ebx
-; X86-NEXT:    sarl $6, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %ebx, 56(%eax)
-; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %edx, 48(%eax)
-; X86-NEXT:    movl %ebp, 44(%eax)
-; X86-NEXT:    movl %edi, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $26, %esi, %edx
+; X86-NEXT:    movl %edx, 44(%eax)
+; X86-NEXT:    shldl $26, %ecx, %esi
+; X86-NEXT:    movl %esi, 40(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $26, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    shrdl $6, %esi, %edx
+; X86-NEXT:    movl %edx, 32(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 28(%eax)
@@ -199,11 +177,7 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_sra:
diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll
index 01551030d938a..dcebbf4c8b7f0 100644
--- a/llvm/test/CodeGen/X86/llrint-conv.ll
+++ b/llvm/test/CodeGen/X86/llrint-conv.ll
@@ -75,7 +75,8 @@ define i64 @test_llrint_i64_f32(float %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_llrint_i64_f32:
 ; X86-NOX87:       # %bb.0: # %entry
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -143,8 +144,10 @@ define i64 @test_llrint_i64_f64(double %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_llrint_i64_f64:
 ; X86-NOX87:       # %bb.0: # %entry
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -213,9 +216,11 @@ define i64 @test_llrint_i64_f80(x86_fp80 %x) nounwind {
 ; X86-NOX87-LABEL: test_llrint_i64_f80:
 ; X86-NOX87:       # %bb.0: # %entry
 ; X86-NOX87-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOX87-NEXT:    pushl %eax
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %edx
+; X86-NOX87-NEXT:    pushl %ecx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $12, %esp
 ; X86-NOX87-NEXT:    retl
@@ -237,15 +242,21 @@ define i64 @test_llrint_i64_f128(fp128 %x) nounwind {
 ; X86-NOSSE:       # %bb.0: # %entry
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $16, %esp
-; X86-NOSSE-NEXT:    pushl 20(%ebp)
-; X86-NOSSE-NEXT:    pushl 16(%ebp)
-; X86-NOSSE-NEXT:    pushl 12(%ebp)
-; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    movl 16(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 20(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 12(%ebp), %esi
+; X86-NOSSE-NEXT:    pushl %ecx
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    pushl %edx
 ; X86-NOSSE-NEXT:    calll llrintl
 ; X86-NOSSE-NEXT:    addl $16, %esp
-; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
+; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
@@ -253,15 +264,21 @@ define i64 @test_llrint_i64_f128(fp128 %x) nounwind {
 ; X86-NOX87:       # %bb.0: # %entry
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -269,15 +286,21 @@ define i64 @test_llrint_i64_f128(fp128 %x) nounwind {
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll llrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -365,7 +388,8 @@ define i64 @test_llrint_i64_f32_strict(float %x) nounwind strictfp {
 ;
 ; X86-NOX87-LABEL: test_llrint_i64_f32_strict:
 ; X86-NOX87:       # %bb.0: # %entry
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -412,8 +436,10 @@ define i64 @test_llrint_i64_f64_strict(double %x) nounwind strictfp {
 ;
 ; X86-NOX87-LABEL: test_llrint_i64_f64_strict:
 ; X86-NOX87:       # %bb.0: # %entry
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -461,9 +487,11 @@ define i64 @test_llrint_i64_f80_strict(x86_fp80 %x) nounwind strictfp {
 ; X86-NOX87-LABEL: test_llrint_i64_f80_strict:
 ; X86-NOX87:       # %bb.0: # %entry
 ; X86-NOX87-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOX87-NEXT:    pushl %eax
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %edx
+; X86-NOX87-NEXT:    pushl %ecx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $12, %esp
 ; X86-NOX87-NEXT:    retl
@@ -488,15 +516,21 @@ define i64 @test_llrint_i64_f128_strict(fp128 %x) nounwind strictfp {
 ; X86-NOSSE:       # %bb.0: # %entry
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $16, %esp
-; X86-NOSSE-NEXT:    pushl 20(%ebp)
-; X86-NOSSE-NEXT:    pushl 16(%ebp)
-; X86-NOSSE-NEXT:    pushl 12(%ebp)
-; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    movl 16(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 20(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 12(%ebp), %esi
+; X86-NOSSE-NEXT:    pushl %ecx
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    pushl %edx
 ; X86-NOSSE-NEXT:    calll llrintl
 ; X86-NOSSE-NEXT:    addl $16, %esp
-; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
+; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
@@ -504,15 +538,21 @@ define i64 @test_llrint_i64_f128_strict(fp128 %x) nounwind strictfp {
 ; X86-NOX87:       # %bb.0: # %entry
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -520,15 +560,21 @@ define i64 @test_llrint_i64_f128_strict(fp128 %x) nounwind strictfp {
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll llrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/llround-conv.ll b/llvm/test/CodeGen/X86/llround-conv.ll
index 93fec71b6eccc..ed46b8a8be2df 100644
--- a/llvm/test/CodeGen/X86/llround-conv.ll
+++ b/llvm/test/CodeGen/X86/llround-conv.ll
@@ -190,15 +190,21 @@ define i64 @test_llround_f128(fp128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llroundl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -418,15 +424,21 @@ define i64 @test_llround_i64_f128(fp128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llroundl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index e3a1b1b83b2e3..58b3daf271135 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -314,41 +314,41 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $44, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    leal 24(%esi), %eax
+; WIN32-NEXT:    leal 20(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal 20(%esi), %eax
+; WIN32-NEXT:    leal 24(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal 16(%esi), %eax
+; WIN32-NEXT:    leal 28(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
-; WIN32-NEXT:    leal 28(%esi), %eax
+; WIN32-NEXT:    leal 16(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 12(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 8(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstps 12(%esi)
-; WIN32-NEXT:    fstps 8(%esi)
-; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    fstps (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    addl $44, %esp
@@ -394,7 +394,6 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $60, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -417,19 +416,20 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 12(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 8(%esi)
 ; WIN32-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
+; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstps {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    fstps 12(%esi)
-; WIN32-NEXT:    fstps 8(%esi)
-; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    fstps (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    addl $60, %esp
@@ -519,9 +519,9 @@ define { double, i32 } @test_frexp_f64_i32(double %a) nounwind {
 ; WIN32-LABEL: test_frexp_f64_i32:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -543,9 +543,9 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) nounwind {
 ; WIN32-LABEL: test_frexp_f64_i32_only_use_fract:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    addl $16, %esp
@@ -568,9 +568,9 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) nounwind {
 ; WIN32-LABEL: test_frexp_f64_i32_only_use_exp:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
index 9b02438952035..6f2e0a2e8f2a4 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
+++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
@@ -7,55 +7,47 @@
 define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
 ; X86-LABEL: test_sincos_v4f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl 84(%esp), %esi
-; X86-NEXT:    flds 76(%esp)
-; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    flds 64(%esp)
-; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    flds 72(%esp)
-; X86-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    flds 68(%esp)
-; X86-NEXT:    movl 80(%esp), %edi
-; X86-NEXT:    leal 40(%esp), %eax
+; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    leal 32(%esp), %eax
 ; X86-NEXT:    movl %eax, 8(%esp)
-; X86-NEXT:    leal 4(%edi), %eax
+; X86-NEXT:    movl 64(%esp), %esi
+; X86-NEXT:    leal 8(%esi), %eax
 ; X86-NEXT:    movl %eax, 4(%esp)
+; X86-NEXT:    flds 56(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll sincosf
-; X86-NEXT:    leal 44(%esp), %eax
+; X86-NEXT:    leal 28(%esp), %eax
 ; X86-NEXT:    movl %eax, 8(%esp)
-; X86-NEXT:    leal 8(%edi), %eax
+; X86-NEXT:    leal 4(%esi), %eax
 ; X86-NEXT:    movl %eax, 4(%esp)
-; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    flds 52(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll sincosf
 ; X86-NEXT:    leal 36(%esp), %eax
 ; X86-NEXT:    movl %eax, 8(%esp)
-; X86-NEXT:    movl %edi, 4(%esp)
-; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    leal 12(%esi), %eax
+; X86-NEXT:    movl %eax, 4(%esp)
+; X86-NEXT:    flds 60(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll sincosf
-; X86-NEXT:    leal 48(%esp), %eax
+; X86-NEXT:    leal 24(%esp), %eax
 ; X86-NEXT:    movl %eax, 8(%esp)
-; X86-NEXT:    addl $12, %edi
-; X86-NEXT:    movl %edi, 4(%esp)
-; X86-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 4(%esp)
+; X86-NEXT:    flds 48(%esp)
 ; X86-NEXT:    fstps (%esp)
 ; X86-NEXT:    calll sincosf
+; X86-NEXT:    movl 68(%esp), %eax
 ; X86-NEXT:    flds 36(%esp)
-; X86-NEXT:    flds 40(%esp)
-; X86-NEXT:    flds 44(%esp)
-; X86-NEXT:    flds 48(%esp)
-; X86-NEXT:    fstps 12(%esi)
-; X86-NEXT:    fstps 8(%esi)
-; X86-NEXT:    fstps 4(%esi)
-; X86-NEXT:    fstps (%esi)
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    fstps 12(%eax)
+; X86-NEXT:    flds 32(%esp)
+; X86-NEXT:    fstps 8(%eax)
+; X86-NEXT:    flds 28(%esp)
+; X86-NEXT:    fstps 4(%eax)
+; X86-NEXT:    flds 24(%esp)
+; X86-NEXT:    fstps (%eax)
+; X86-NEXT:    addl $40, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sincos_v4f32:
@@ -219,33 +211,29 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias
 define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
 ; X86-LABEL: test_sincos_v2f64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movl 84(%esp), %esi
-; X86-NEXT:    fldl 72(%esp)
-; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; X86-NEXT:    fldl 64(%esp)
-; X86-NEXT:    movl 80(%esp), %edi
-; X86-NEXT:    leal 24(%esp), %eax
+; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    leal 32(%esp), %eax
 ; X86-NEXT:    movl %eax, 12(%esp)
-; X86-NEXT:    movl %edi, 8(%esp)
+; X86-NEXT:    movl 64(%esp), %esi
+; X86-NEXT:    leal 8(%esi), %eax
+; X86-NEXT:    movl %eax, 8(%esp)
+; X86-NEXT:    fldl 56(%esp)
 ; X86-NEXT:    fstpl (%esp)
 ; X86-NEXT:    calll sincos
-; X86-NEXT:    leal 32(%esp), %eax
+; X86-NEXT:    leal 24(%esp), %eax
 ; X86-NEXT:    movl %eax, 12(%esp)
-; X86-NEXT:    addl $8, %edi
-; X86-NEXT:    movl %edi, 8(%esp)
-; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT:    movl %esi, 8(%esp)
+; X86-NEXT:    fldl 48(%esp)
 ; X86-NEXT:    fstpl (%esp)
 ; X86-NEXT:    calll sincos
-; X86-NEXT:    fldl 24(%esp)
+; X86-NEXT:    movl 68(%esp), %eax
 ; X86-NEXT:    fldl 32(%esp)
-; X86-NEXT:    fstpl 8(%esi)
-; X86-NEXT:    fstpl (%esi)
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    fstpl 8(%eax)
+; X86-NEXT:    fldl 24(%esp)
+; X86-NEXT:    fstpl (%eax)
+; X86-NEXT:    addl $40, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_sincos_v2f64:
diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll
index f21c07599d6f1..6c7bd1cf80e2f 100644
--- a/llvm/test/CodeGen/X86/load-combine.ll
+++ b/llvm/test/CodeGen/X86/load-combine.ll
@@ -472,19 +472,19 @@ define i32 @load_i32_by_i8_bswap_store_in_between(ptr %arg, ptr %arg1) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    .cfi_offset %esi, -8
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl 1(%eax), %ecx
 ; CHECK-NEXT:    movzbl (%eax), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $0, (%esi)
 ; CHECK-NEXT:    shll $24, %edx
-; CHECK-NEXT:    movzbl 1(%eax), %esi
-; CHECK-NEXT:    movl $0, (%ecx)
-; CHECK-NEXT:    shll $16, %esi
-; CHECK-NEXT:    orl %edx, %esi
-; CHECK-NEXT:    movzbl 2(%eax), %ecx
-; CHECK-NEXT:    shll $8, %ecx
-; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movzbl 2(%eax), %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %ecx, %edx
 ; CHECK-NEXT:    movzbl 3(%eax), %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    orl %edx, %eax
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
@@ -532,17 +532,17 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(ptr %arg, ptr %arg1) {
 ; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movzbl (%ecx), %edx
-; CHECK-NEXT:    shll $24, %edx
-; CHECK-NEXT:    movzbl 1(%eax), %eax
-; CHECK-NEXT:    shll $16, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    movzbl 2(%ecx), %edx
-; CHECK-NEXT:    shll $8, %edx
-; CHECK-NEXT:    orl %eax, %edx
-; CHECK-NEXT:    movzbl 3(%ecx), %eax
-; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $24, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movzbl 1(%edx), %edx
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    movzbl 2(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movzbl 3(%eax), %eax
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load:
@@ -894,7 +894,7 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_base_offset_index:
@@ -939,7 +939,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 13(%eax,%ecx), %eax
+; CHECK-NEXT:    movl 13(%ecx,%eax), %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
@@ -995,7 +995,7 @@ define i32 @load_i32_by_i8_zaext_loads(ptr %arg, i32 %arg1) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
@@ -1051,7 +1051,7 @@ define i32 @load_i32_by_i8_zsext_loads(ptr %arg, i32 %arg1) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT:    retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zsext_loads:
@@ -1288,11 +1288,11 @@ define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl (%edx), %esi
-; CHECK-NEXT:    movzwl (%edx), %eax
-; CHECK-NEXT:    movl $0, (%ecx)
-; CHECK-NEXT:    movl %esi, (%edx)
+; CHECK-NEXT:    movzwl (%ecx), %eax
+; CHECK-NEXT:    movl (%ecx), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl $0, (%esi)
+; CHECK-NEXT:    movl %edx, (%ecx)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll
index 57369be489af3..0095ce3101af4 100644
--- a/llvm/test/CodeGen/X86/localescape.ll
+++ b/llvm/test/CodeGen/X86/localescape.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
 ; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
 
@@ -10,6 +11,57 @@ declare i32 @printf(ptr, ...)
 @str = internal constant [10 x i8] c"asdf: %d\0A\00"
 
 define void @print_framealloc_from_fp(ptr %fp) {
+; X64-LABEL: print_framealloc_from_fp:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rsi
+; X64-NEXT:    .seh_pushreg %rsi
+; X64-NEXT:    pushq %rdi
+; X64-NEXT:    .seh_pushreg %rdi
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    movl .Lalloc_func$frame_escape_0(%rcx), %edx
+; X64-NEXT:    leaq str(%rip), %rdi
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    callq printf
+; X64-NEXT:    movl .Lalloc_func$frame_escape_1(%rsi), %edx
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    callq printf
+; X64-NEXT:    movl $42, .Lalloc_func$frame_escape_1(%rsi)
+; X64-NEXT:    movl $4, %eax
+; X64-NEXT:    movl .Lalloc_func$frame_escape_1(%rsi,%rax), %edx
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    callq printf
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    popq %rdi
+; X64-NEXT:    popq %rsi
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; X86-LABEL: print_framealloc_from_fp:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    pushl Lalloc_func$frame_escape_0(%esi)
+; X86-NEXT:    pushl $_str
+; X86-NEXT:    calll _printf
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    pushl Lalloc_func$frame_escape_1(%esi)
+; X86-NEXT:    pushl $_str
+; X86-NEXT:    calll _printf
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl $42, Lalloc_func$frame_escape_1(%esi)
+; X86-NEXT:    movl $4, %eax
+; X86-NEXT:    pushl Lalloc_func$frame_escape_1(%esi,%eax)
+; X86-NEXT:    pushl $_str
+; X86-NEXT:    calll _printf
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
   %a.i8 = call ptr @llvm.localrecover(ptr @alloc_func, ptr %fp, i32 0)
   %a.val = load i32, ptr %a.i8
   call i32 (ptr, ...) @printf(ptr @str, i32 %a.val)
@@ -56,6 +108,53 @@ define void @print_framealloc_from_fp(ptr %fp) {
 ; X86: retl
 
 define void @alloc_func(i32 %n) {
+; X64-LABEL: alloc_func:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $16, %rsp
+; X64-NEXT:    .seh_stackalloc 16
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    .seh_setframe %rbp, 16
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:  .Lalloc_func$frame_escape_0 = -4
+; X64-NEXT:  .Lalloc_func$frame_escape_1 = -12
+; X64-NEXT:    movl $42, -4(%rbp)
+; X64-NEXT:    movl $13, -12(%rbp)
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    addq $15, %rax
+; X64-NEXT:    andq $-16, %rax
+; X64-NEXT:    callq __chkstk
+; X64-NEXT:    subq %rax, %rsp
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    movq %rbp, %rcx
+; X64-NEXT:    callq print_framealloc_from_fp
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; X86-LABEL: alloc_func:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:  Lalloc_func$frame_escape_0 = -4
+; X86-NEXT:  Lalloc_func$frame_escape_1 = -12
+; X86-NEXT:    movl $13, -12(%ebp)
+; X86-NEXT:    movl $42, -4(%ebp)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    addl $3, %eax
+; X86-NEXT:    andl $-4, %eax
+; X86-NEXT:    calll __chkstk
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    calll _print_framealloc_from_fp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
   %a = alloca i32
   %b = alloca i32, i32 2
   call void (...) @llvm.localescape(ptr %a, ptr %b)
@@ -100,11 +199,61 @@ define void @alloc_func(i32 %n) {
 
 ; Helper to make this a complete program so it can be compiled and tested.
 define i32 @main() {
+; X64-LABEL: main:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $3, %ecx
+; X64-NEXT:    callq alloc_func
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; X86-LABEL: main:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $3
+; X86-NEXT:    calll _alloc_func
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
   call void @alloc_func(i32 3)
   ret i32 0
 }
 
 define void @alloc_func_no_frameaddr() {
+; X64-LABEL: alloc_func_no_frameaddr:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:  .Lalloc_func_no_frameaddr$frame_escape_0 = 36
+; X64-NEXT:  .Lalloc_func_no_frameaddr$frame_escape_1 = 32
+; X64-NEXT:    movl $42, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $13, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    callq print_framealloc_from_fp
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; X86-LABEL: alloc_func_no_frameaddr:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:  Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86-NEXT:  Lalloc_func_no_frameaddr$frame_escape_1 = 0
+; X86-NEXT:    movl $13, (%esp)
+; X86-NEXT:    movl $42, {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl $0
+; X86-NEXT:    calll _print_framealloc_from_fp
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
   %a = alloca i32
   %b = alloca i32
   call void (...) @llvm.localescape(ptr %a, ptr %b)
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
index d7390299c7b10..73839cfef22f8 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-- -relocation-model=pic | FileCheck %s -check-prefix=PIC
 ; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s -check-prefix=STATIC
 ;
@@ -22,6 +23,51 @@
 @A = global [16 x [16 x i32]] zeroinitializer, align 32		; <ptr> [#uses=2]
 
 define void @test(i32 %row, i32 %N.in) nounwind {
+; PIC-LABEL: test:
+; PIC:       # %bb.0: # %entry
+; PIC-NEXT:    calll .L0$pb
+; PIC-NEXT:  .L0$pb:
+; PIC-NEXT:    popl %ecx
+; PIC-NEXT:  .Ltmp0:
+; PIC-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ecx
+; PIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; PIC-NEXT:    testl %eax, %eax
+; PIC-NEXT:    jle .LBB0_3
+; PIC-NEXT:  # %bb.1: # %cond_true.preheader
+; PIC-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; PIC-NEXT:    shll $6, %edx
+; PIC-NEXT:    movl A at GOT(%ecx), %ecx
+; PIC-NEXT:    leal 8(%edx,%ecx), %ecx
+; PIC-NEXT:    .p2align 4
+; PIC-NEXT:  .LBB0_2: # %cond_true
+; PIC-NEXT:    # =>This Inner Loop Header: Depth=1
+; PIC-NEXT:    movl $5, (%ecx)
+; PIC-NEXT:    movl $4, -4(%ecx)
+; PIC-NEXT:    addl $4, %ecx
+; PIC-NEXT:    decl %eax
+; PIC-NEXT:    jne .LBB0_2
+; PIC-NEXT:  .LBB0_3: # %return
+; PIC-NEXT:    retl
+;
+; STATIC-LABEL: test:
+; STATIC:       # %bb.0: # %entry
+; STATIC-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; STATIC-NEXT:    testl %eax, %eax
+; STATIC-NEXT:    jle .LBB0_3
+; STATIC-NEXT:  # %bb.1: # %cond_true.preheader
+; STATIC-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; STATIC-NEXT:    shll $6, %ecx
+; STATIC-NEXT:    leal A+8(%ecx), %ecx
+; STATIC-NEXT:    .p2align 4
+; STATIC-NEXT:  .LBB0_2: # %cond_true
+; STATIC-NEXT:    # =>This Inner Loop Header: Depth=1
+; STATIC-NEXT:    movl $5, (%ecx)
+; STATIC-NEXT:    movl $4, -4(%ecx)
+; STATIC-NEXT:    addl $4, %ecx
+; STATIC-NEXT:    decl %eax
+; STATIC-NEXT:    jne .LBB0_2
+; STATIC-NEXT:  .LBB0_3: # %return
+; STATIC-NEXT:    retl
 entry:
 	%N = bitcast i32 %N.in to i32		; <i32> [#uses=1]
 	%tmp5 = icmp sgt i32 %N.in, 0		; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll
index 558a4e9fb0864..fa3323ec736b3 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce-3.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
 
 ; CHECK: align
@@ -10,6 +11,25 @@
 @A = global [16 x [16 x i32]] zeroinitializer, align 32		; <ptr> [#uses=2]
 
 define void @test(i32 %row, i32 %N.in) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jle LBB0_3
+; CHECK-NEXT:  ## %bb.1: ## %cond_true.preheader
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    shll $6, %ecx
+; CHECK-NEXT:    leal _A+8(%ecx), %ecx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  LBB0_2: ## %cond_true
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl $5, (%ecx)
+; CHECK-NEXT:    movl $4, -4(%ecx)
+; CHECK-NEXT:    addl $4, %ecx
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    jne LBB0_2
+; CHECK-NEXT:  LBB0_3: ## %return
+; CHECK-NEXT:    retl
 entry:
 	%N = bitcast i32 %N.in to i32		; <i32> [#uses=1]
 	%tmp5 = icmp sgt i32 %N.in, 0		; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce.ll b/llvm/test/CodeGen/X86/loop-strength-reduce.ll
index a8c28b7d16dce..ba56dd8f17392 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s
 
 ; CHECK: align
@@ -10,6 +11,25 @@
 @A = internal global [16 x [16 x i32]] zeroinitializer, align 32		; <ptr> [#uses=2]
 
 define void @test(i32 %row, i32 %N.in) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jle .LBB0_3
+; CHECK-NEXT:  # %bb.1: # %cond_true.preheader
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    shll $6, %ecx
+; CHECK-NEXT:    leal A+8(%ecx), %ecx
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_2: # %cond_true
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl $5, (%ecx)
+; CHECK-NEXT:    movl $4, -4(%ecx)
+; CHECK-NEXT:    addl $4, %ecx
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  .LBB0_3: # %return
+; CHECK-NEXT:    retl
 entry:
 	%N = bitcast i32 %N.in to i32		; <i32> [#uses=1]
 	%tmp5 = icmp sgt i32 %N.in, 0		; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
index 08003739b55d0..0273081862eb8 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
@@ -18,8 +18,8 @@ define void @foo(i32 %N) nounwind {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movw %cx, X
 ; CHECK-NEXT:    movw %dx, Y
-; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    addl $4, %edx
+; CHECK-NEXT:    incl %ecx
 ; CHECK-NEXT:    cmpl %ecx, %eax
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: # %return
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
index 11473bacc56cf..d1d532c12239e 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
@@ -23,14 +23,14 @@ define fastcc void @outer_loop(ptr nocapture %gfp, ptr nocapture %xr, i32 %targ_
 ; CHECK-NEXT:  LBB0_3: ## %bb29.i38
 ; CHECK-NEXT:    ## Parent Loop BB0_2 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    incl %edx
 ; CHECK-NEXT:    addl $12, %esi
+; CHECK-NEXT:    incl %edx
 ; CHECK-NEXT:    cmpl $11, %edx
 ; CHECK-NEXT:    jbe LBB0_3
 ; CHECK-NEXT:  ## %bb.1: ## %bb28.i37.loopexit
 ; CHECK-NEXT:    ## in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    addl $4, %eax
 ; CHECK-NEXT:    addl $168, %ecx
+; CHECK-NEXT:    addl $4, %eax
 ; CHECK-NEXT:    jmp LBB0_2
 entry:
 	br label %bb4
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i32.ll b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
index f4cb0d3ff87e6..322257d8b234d 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i32.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
@@ -145,15 +145,21 @@ define i32 @test_lrint_i32_f128(fp128 %x) nounwind {
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $16, %esp
-; X86-NOSSE-NEXT:    pushl 20(%ebp)
-; X86-NOSSE-NEXT:    pushl 16(%ebp)
-; X86-NOSSE-NEXT:    pushl 12(%ebp)
-; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    movl 16(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 20(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 12(%ebp), %esi
+; X86-NOSSE-NEXT:    pushl %ecx
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    pushl %edx
 ; X86-NOSSE-NEXT:    calll lrintl
 ; X86-NOSSE-NEXT:    addl $16, %esp
-; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
+; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
@@ -161,15 +167,21 @@ define i32 @test_lrint_i32_f128(fp128 %x) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -343,15 +355,21 @@ define i32 @test_lrint_i32_f128_strict(fp128 %x) nounwind strictfp {
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $16, %esp
-; X86-NOSSE-NEXT:    pushl 20(%ebp)
-; X86-NOSSE-NEXT:    pushl 16(%ebp)
-; X86-NOSSE-NEXT:    pushl 12(%ebp)
-; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    movl 16(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 20(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 12(%ebp), %esi
+; X86-NOSSE-NEXT:    pushl %ecx
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    pushl %edx
 ; X86-NOSSE-NEXT:    calll lrintl
 ; X86-NOSSE-NEXT:    addl $16, %esp
-; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
+; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
@@ -359,15 +377,21 @@ define i32 @test_lrint_i32_f128_strict(fp128 %x) nounwind strictfp {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index c45918ea4d5ee..897bf29143dc7 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -72,7 +72,8 @@ define i64 @test_lrint_i64_f32(float %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_lrint_i64_f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -123,8 +124,10 @@ define i64 @test_lrint_i64_f64(double %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_lrint_i64_f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -176,9 +179,11 @@ define i64 @test_lrint_i64_f80(x86_fp80 %x) nounwind {
 ; X86-NOX87-LABEL: test_lrint_i64_f80:
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOX87-NEXT:    pushl %eax
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %edx
+; X86-NOX87-NEXT:    pushl %ecx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $12, %esp
 ; X86-NOX87-NEXT:    retl
@@ -199,15 +204,21 @@ define i64 @test_lrint_i64_f128(fp128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lrintl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -215,15 +226,21 @@ define i64 @test_lrint_i64_f128(fp128 %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -293,7 +310,8 @@ define i64 @test_lrint_i64_f32_strict(float %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_lrint_i64_f32_strict:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -329,8 +347,10 @@ define i64 @test_lrint_i64_f64_strict(double %x) nounwind {
 ;
 ; X86-NOX87-LABEL: test_lrint_i64_f64_strict:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -367,9 +387,11 @@ define i64 @test_lrint_i64_f80_strict(x86_fp80 %x) nounwind {
 ; X86-NOX87-LABEL: test_lrint_i64_f80_strict:
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOX87-NEXT:    pushl %eax
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %edx
+; X86-NOX87-NEXT:    pushl %ecx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $12, %esp
 ; X86-NOX87-NEXT:    retl
@@ -392,15 +414,21 @@ define i64 @test_lrint_i64_f128_strict(fp128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lrintl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -408,15 +436,21 @@ define i64 @test_lrint_i64_f128_strict(fp128 %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -446,7 +480,8 @@ define i32 @PR125324(float %x) nounwind {
 ;
 ; X86-NOX87-LABEL: PR125324:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/lround-conv-i32.ll b/llvm/test/CodeGen/X86/lround-conv-i32.ll
index 4e95da14b45a1..3cd3fc6c37dc1 100644
--- a/llvm/test/CodeGen/X86/lround-conv-i32.ll
+++ b/llvm/test/CodeGen/X86/lround-conv-i32.ll
@@ -163,15 +163,21 @@ define i32 @test_lround_i32_f128(fp128 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lroundl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/lround-conv-i64.ll b/llvm/test/CodeGen/X86/lround-conv-i64.ll
index 0002bb8936d5d..def176a842a3d 100644
--- a/llvm/test/CodeGen/X86/lround-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lround-conv-i64.ll
@@ -193,15 +193,21 @@ define i64 @test_lround_i64_f128(fp128 %x) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll lroundl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/lwp-intrinsics.ll b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
index 6f32b09c838f0..a99864ed127cc 100644
--- a/llvm/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/lwp-intrinsics.ll
@@ -43,18 +43,18 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
 ; X86_LWP-LABEL: test_lwpins32_rri:
 ; X86_LWP:       # %bb.0:
 ; X86_LWP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_LWP-NEXT:    addl %eax, %eax
 ; X86_LWP-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_LWP-NEXT:    addl %ecx, %ecx
-; X86_LWP-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_LWP-NEXT:    lwpins $-1985229329, %eax, %ecx # imm = 0x89ABCDEF
 ; X86_LWP-NEXT:    setb %al
 ; X86_LWP-NEXT:    retl
 ;
 ; X86_BDVER-LABEL: test_lwpins32_rri:
 ; X86_BDVER:       # %bb.0:
-; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER-NEXT:    addl %ecx, %ecx
-; X86_BDVER-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT:    addl %eax, %eax
+; X86_BDVER-NEXT:    lwpins $-1985229329, %eax, %ecx # imm = 0x89ABCDEF
 ; X86_BDVER-NEXT:    setb %al
 ; X86_BDVER-NEXT:    retl
 ;
@@ -92,17 +92,17 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
 ; X86_LWP-LABEL: test_lwpval32_rri:
 ; X86_LWP:       # %bb.0:
 ; X86_LWP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_LWP-NEXT:    addl %eax, %eax
 ; X86_LWP-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_LWP-NEXT:    addl %ecx, %ecx
-; X86_LWP-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_LWP-NEXT:    lwpval $-19088744, %eax, %ecx # imm = 0xFEDCBA98
 ; X86_LWP-NEXT:    retl
 ;
 ; X86_BDVER-LABEL: test_lwpval32_rri:
 ; X86_BDVER:       # %bb.0:
-; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_BDVER-NEXT:    addl %ecx, %ecx
-; X86_BDVER-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER-NEXT:    addl %eax, %eax
+; X86_BDVER-NEXT:    lwpval $-19088744, %eax, %ecx # imm = 0xFEDCBA98
 ; X86_BDVER-NEXT:    retl
 ;
 ; X64-LABEL: test_lwpval32_rri:
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index d0a46388b7e76..a1b5693d895ac 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -177,8 +177,8 @@ define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) !prof
 ;
 ; X86-LABEL: test5:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %k1, %k2
 ; X86-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
 ; X86-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
@@ -222,9 +222,9 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X86-KNL-NEXT:    movw $255, %ax
 ; X86-KNL-NEXT:    kmovw %eax, %k1
+; X86-KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vpgatherdd (,%zmm1), %zmm2 {%k2}
 ; X86-KNL-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
@@ -274,17 +274,17 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
 ; X86-KNL-LABEL: test7:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    kmovw %ecx, %k0
+; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    kmovw %eax, %k0
 ; X86-KNL-NEXT:    kshiftlw $8, %k0, %k0
 ; X86-KNL-NEXT:    kshiftrw $8, %k0, %k1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
-; X86-KNL-NEXT:    vmovdqa %ymm1, %ymm2
-; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
-; X86-KNL-NEXT:    vpaddd %ymm1, %ymm2, %ymm0
+; X86-KNL-NEXT:    vmovdqa64 %zmm1, %zmm2
+; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
+; X86-KNL-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
 ; X86-KNL-NEXT:    retl
 ;
 ; X64-SKX-LABEL: test7:
@@ -385,17 +385,17 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ;
 ; X86-KNL-LABEL: test9:
 ; X86-KNL:       # %bb.0: # %entry
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
-; X86-KNL-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [80,80,80,80,80,80,80,80]
+; X86-KNL-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X86-KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
-; X86-KNL-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
+; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [820,820,820,820,820,820,820,820]
+; X86-KNL-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
 ; X86-KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; X86-KNL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    movw $255, %ax
 ; X86-KNL-NEXT:    kmovw %eax, %k1
+; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vpgatherdd 68(,%zmm1), %zmm0 {%k1}
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-KNL-NEXT:    retl
@@ -469,17 +469,17 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ;
 ; X86-KNL-LABEL: test10:
 ; X86-KNL:       # %bb.0: # %entry
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
-; X86-KNL-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [80,80,80,80,80,80,80,80]
+; X86-KNL-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X86-KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
-; X86-KNL-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
+; X86-KNL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [820,820,820,820,820,820,820,820]
+; X86-KNL-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
 ; X86-KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; X86-KNL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
-; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    movw $255, %ax
 ; X86-KNL-NEXT:    kmovw %eax, %k1
+; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vpgatherdd 68(,%zmm1), %zmm0 {%k1}
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-KNL-NEXT:    retl
@@ -692,11 +692,11 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
 ;
 ; X86-KNL-LABEL: test15:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    vpslld $31, %xmm1, %xmm1
-; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm2
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
+; X86-KNL-NEXT:    vpslld $31, %xmm1, %xmm0
+; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    # implicit-def: $xmm0
@@ -714,22 +714,22 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
 ; X86-KNL-NEXT:  .LBB14_1: # %cond.load
-; X86-KNL-NEXT:    vmovd %xmm1, %ecx
+; X86-KNL-NEXT:    vmovd %xmm2, %ecx
 ; X86-KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-KNL-NEXT:    testb $2, %al
 ; X86-KNL-NEXT:    je .LBB14_4
 ; X86-KNL-NEXT:  .LBB14_3: # %cond.load1
-; X86-KNL-NEXT:    vpextrd $1, %xmm1, %ecx
+; X86-KNL-NEXT:    vpextrd $1, %xmm2, %ecx
 ; X86-KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X86-KNL-NEXT:    testb $4, %al
 ; X86-KNL-NEXT:    je .LBB14_6
 ; X86-KNL-NEXT:  .LBB14_5: # %cond.load4
-; X86-KNL-NEXT:    vpextrd $2, %xmm1, %ecx
+; X86-KNL-NEXT:    vpextrd $2, %xmm2, %ecx
 ; X86-KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X86-KNL-NEXT:    testb $8, %al
 ; X86-KNL-NEXT:    je .LBB14_8
 ; X86-KNL-NEXT:  .LBB14_7: # %cond.load7
-; X86-KNL-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-KNL-NEXT:    vpextrd $3, %xmm2, %eax
 ; X86-KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
@@ -808,11 +808,11 @@ define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x doub
 ;
 ; X86-KNL-LABEL: test16:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; X86-KNL-NEXT:    vpslld $31, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB15_1
@@ -909,11 +909,11 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
 ;
 ; X86-KNL-LABEL: test17:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB16_1
@@ -966,9 +966,9 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
 ;
 ; X86-SKX-LABEL: test17:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB16_1
@@ -1142,23 +1142,23 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
 ; X86-KNL-LABEL: test19:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
+; X86-KNL-NEXT:    vpmovqd %zmm2, %ymm2
+; X86-KNL-NEXT:    vpslld $3, %xmm2, %xmm2
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
 ; X86-KNL-NEXT:    vpslld $31, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpmovqd %zmm2, %ymm1
-; X86-KNL-NEXT:    vpslld $3, %xmm1, %xmm1
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; X86-KNL-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    je .LBB18_2
 ; X86-KNL-NEXT:  # %bb.1: # %cond.store
-; X86-KNL-NEXT:    vmovd %xmm1, %ecx
+; X86-KNL-NEXT:    vmovd %xmm2, %ecx
 ; X86-KNL-NEXT:    vmovlps %xmm0, (%ecx)
 ; X86-KNL-NEXT:  .LBB18_2: # %else
 ; X86-KNL-NEXT:    testb $2, %al
 ; X86-KNL-NEXT:    je .LBB18_4
 ; X86-KNL-NEXT:  # %bb.3: # %cond.store1
-; X86-KNL-NEXT:    vpextrd $1, %xmm1, %ecx
+; X86-KNL-NEXT:    vpextrd $1, %xmm2, %ecx
 ; X86-KNL-NEXT:    vmovhps %xmm0, (%ecx)
 ; X86-KNL-NEXT:  .LBB18_4: # %else2
 ; X86-KNL-NEXT:    testb $4, %al
@@ -1171,12 +1171,12 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
 ; X86-KNL-NEXT:  .LBB18_5: # %cond.store3
-; X86-KNL-NEXT:    vpextrd $2, %xmm1, %ecx
+; X86-KNL-NEXT:    vpextrd $2, %xmm2, %ecx
 ; X86-KNL-NEXT:    vmovlps %xmm0, (%ecx)
 ; X86-KNL-NEXT:    testb $8, %al
 ; X86-KNL-NEXT:    je .LBB18_8
 ; X86-KNL-NEXT:  .LBB18_7: # %cond.store5
-; X86-KNL-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-KNL-NEXT:    vpextrd $3, %xmm2, %eax
 ; X86-KNL-NEXT:    vmovhps %xmm0, (%eax)
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
@@ -1430,11 +1430,11 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
 ;
 ; X86-KNL-LABEL: test22:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB21_1
@@ -1489,9 +1489,9 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
 ;
 ; X86-SKX-LABEL: test22:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB21_1
@@ -1552,12 +1552,12 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
 ;
 ; X86-KNL-LABEL: test22a:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; X86-KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
+; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB22_1
@@ -1611,10 +1611,10 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
 ;
 ; X86-SKX-LABEL: test22a:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB22_1
@@ -1677,11 +1677,11 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
 ;
 ; X86-KNL-LABEL: test23:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB23_1
@@ -1734,9 +1734,9 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
 ;
 ; X86-SKX-LABEL: test23:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB23_1
@@ -1795,12 +1795,12 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
 ;
 ; X86-KNL-LABEL: test23b:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; X86-KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
+; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB24_1
@@ -1852,10 +1852,10 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
 ;
 ; X86-SKX-LABEL: test23b:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB24_1
@@ -1899,10 +1899,10 @@ define <2 x i32> @test24(ptr %base, <2 x i32> %ind) {
 ; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
 ; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X86-KNL-NEXT:    vmovd %xmm0, %eax
-; X86-KNL-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-KNL-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-KNL-NEXT:    vmovd %xmm0, %ecx
 ; X86-KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-KNL-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
+; X86-KNL-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
 ; X86-KNL-NEXT:    retl
 ;
 ; X64-SKX-LABEL: test24:
@@ -1921,10 +1921,10 @@ define <2 x i32> @test24(ptr %base, <2 x i32> %ind) {
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    vmovd %xmm0, %eax
-; X86-SKX-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-SKX-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-SKX-NEXT:    vmovd %xmm0, %ecx
 ; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
+; X86-SKX-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
 ; X86-SKX-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i32, ptr %base, <2 x i64> %sext_ind
@@ -1966,11 +1966,11 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
 ;
 ; X86-KNL-LABEL: test25:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; X86-KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; X86-KNL-NEXT:    vpslld $3, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB26_1
@@ -2025,9 +2025,9 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
 ;
 ; X86-SKX-LABEL: test25:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
+; X86-SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vmovmskpd %xmm1, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB26_1
@@ -2076,11 +2076,11 @@ define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
 ; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-KNL-NEXT:    vmovd %xmm0, %eax
-; X86-KNL-NEXT:    vpextrd $1, %xmm0, %ecx
-; X86-KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-KNL-NEXT:    vpinsrd $1, 4(%eax), %xmm0, %xmm0
-; X86-KNL-NEXT:    vpinsrd $2, (%ecx), %xmm0, %xmm0
-; X86-KNL-NEXT:    vpinsrd $3, 4(%ecx), %xmm0, %xmm0
+; X86-KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-KNL-NEXT:    vpinsrd $1, 4(%eax), %xmm1, %xmm1
+; X86-KNL-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-KNL-NEXT:    vpinsrd $2, (%eax), %xmm1, %xmm0
+; X86-KNL-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
 ; X86-KNL-NEXT:    retl
 ;
 ; X64-SKX-LABEL: test26:
@@ -2101,11 +2101,11 @@ define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; X86-SKX-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vmovd %xmm0, %eax
-; X86-SKX-NEXT:    vpextrd $1, %xmm0, %ecx
-; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vpinsrd $1, 4(%eax), %xmm0, %xmm0
-; X86-SKX-NEXT:    vpinsrd $2, (%ecx), %xmm0, %xmm0
-; X86-SKX-NEXT:    vpinsrd $3, 4(%ecx), %xmm0, %xmm0
+; X86-SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SKX-NEXT:    vpinsrd $1, 4(%eax), %xmm1, %xmm1
+; X86-SKX-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-SKX-NEXT:    vpinsrd $2, (%eax), %xmm1, %xmm0
+; X86-SKX-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
 ; X86-SKX-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i64, ptr %base, <2 x i64> %sext_ind
@@ -2133,8 +2133,8 @@ define <2 x float> @test27(ptr %base, <2 x i32> %ind) {
 ; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
 ; X86-KNL-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X86-KNL-NEXT:    vmovd %xmm0, %eax
-; X86-KNL-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-KNL-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-KNL-NEXT:    vmovd %xmm0, %ecx
 ; X86-KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X86-KNL-NEXT:    retl
@@ -2155,8 +2155,8 @@ define <2 x float> @test27(ptr %base, <2 x i32> %ind) {
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    vmovd %xmm0, %eax
-; X86-SKX-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-SKX-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-SKX-NEXT:    vmovd %xmm0, %ecx
 ; X86-SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X86-SKX-NEXT:    retl
@@ -2206,10 +2206,10 @@ define <16 x float> @test29(ptr %base, <16 x i32> %ind) {
 ;
 ; X86-LABEL: test29:
 ; X86:       # %bb.0:
+; X86-NEXT:    movw $44, %ax
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    movw $44, %cx
-; X86-NEXT:    kmovw %ecx, %k1
 ; X86-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
 ; X86-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-NEXT:    retl
@@ -2571,10 +2571,10 @@ define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i6
 ; X86-KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; X86-KNL-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-KNL-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
 ; X86-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; X86-KNL-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
 ; X86-KNL-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; X86-KNL-NEXT:    movl %ebp, %esp
@@ -2606,10 +2606,10 @@ define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i6
 ; X86-SKX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-SKX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-SKX-NEXT:    vpmovd2m %zmm1, %k1
-; X86-SKX-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-SKX-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
 ; X86-SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; X86-SKX-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
 ; X86-SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; X86-SKX-NEXT:    movl %ebp, %esp
@@ -2690,10 +2690,10 @@ define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
 ; X86-KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; X86-KNL-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-KNL-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
 ; X86-KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X86-KNL-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
 ; X86-KNL-NEXT:    vmovapd %zmm2, %zmm0
 ; X86-KNL-NEXT:    movl %ebp, %esp
@@ -2725,10 +2725,10 @@ define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
 ; X86-SKX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-SKX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-SKX-NEXT:    vpmovd2m %zmm1, %k1
-; X86-SKX-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-SKX-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
 ; X86-SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X86-SKX-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
 ; X86-SKX-NEXT:    vmovapd %zmm2, %zmm0
 ; X86-SKX-NEXT:    movl %ebp, %esp
@@ -2808,10 +2808,10 @@ define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %s
 ; X86-KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; X86-KNL-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-KNL-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
 ; X86-KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; X86-KNL-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
 ; X86-KNL-NEXT:    movl %ebp, %esp
 ; X86-KNL-NEXT:    popl %ebp
@@ -2842,10 +2842,10 @@ define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %s
 ; X86-SKX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-SKX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-SKX-NEXT:    vpmovd2m %zmm1, %k1
-; X86-SKX-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-SKX-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
 ; X86-SKX-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; X86-SKX-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
 ; X86-SKX-NEXT:    movl %ebp, %esp
 ; X86-SKX-NEXT:    popl %ebp
@@ -2926,10 +2926,10 @@ define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double>
 ; X86-KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; X86-KNL-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-KNL-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
 ; X86-KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X86-KNL-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-KNL-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
 ; X86-KNL-NEXT:    movl %ebp, %esp
 ; X86-KNL-NEXT:    popl %ebp
@@ -2960,10 +2960,10 @@ define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double>
 ; X86-SKX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; X86-SKX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; X86-SKX-NEXT:    vpmovd2m %zmm1, %k1
-; X86-SKX-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; X86-SKX-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
 ; X86-SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X86-SKX-NEXT:    vmovapd 8(%ebp), %zmm1
 ; X86-SKX-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
 ; X86-SKX-NEXT:    movl %ebp, %esp
 ; X86-SKX-NEXT:    popl %ebp
@@ -3429,13 +3429,13 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
 ;
 ; X86-KNL-LABEL: large_index:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-KNL-NEXT:    vpslld $2, %xmm2, %xmm2
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
 ; X86-KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; X86-KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; X86-KNL-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; X86-KNL-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB47_1
@@ -3447,13 +3447,13 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
 ; X86-KNL-NEXT:  .LBB47_1: # %cond.load
-; X86-KNL-NEXT:    vmovd %xmm0, %ecx
-; X86-KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-KNL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; X86-KNL-NEXT:    vmovd %xmm2, %ecx
+; X86-KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-KNL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; X86-KNL-NEXT:    testb $2, %al
 ; X86-KNL-NEXT:    je .LBB47_4
 ; X86-KNL-NEXT:  .LBB47_3: # %cond.load1
-; X86-KNL-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-KNL-NEXT:    vpextrd $1, %xmm2, %eax
 ; X86-KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; X86-KNL-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-KNL-NEXT:    vzeroupper
@@ -3492,29 +3492,28 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
 ;
 ; X86-SKX-LABEL: large_index:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X86-SKX-NEXT:    vpslld $2, %xmm2, %xmm2
+; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm2, %xmm2
 ; X86-SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
-; X86-SKX-NEXT:    vpmovq2m %xmm0, %k0
-; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; X86-SKX-NEXT:    vpslld $2, %xmm0, %xmm0
-; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm0, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB47_1
 ; X86-SKX-NEXT:  # %bb.2: # %else
 ; X86-SKX-NEXT:    testb $2, %al
 ; X86-SKX-NEXT:    jne .LBB47_3
 ; X86-SKX-NEXT:  .LBB47_4: # %else2
-; X86-SKX-NEXT:    vmovaps %xmm1, %xmm0
+; X86-SKX-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-SKX-NEXT:    retl
 ; X86-SKX-NEXT:  .LBB47_1: # %cond.load
-; X86-SKX-NEXT:    vmovd %xmm0, %ecx
-; X86-SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vmovss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; X86-SKX-NEXT:    vmovd %xmm2, %ecx
+; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SKX-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; X86-SKX-NEXT:    testb $2, %al
 ; X86-SKX-NEXT:    je .LBB47_4
 ; X86-SKX-NEXT:  .LBB47_3: # %cond.load1
-; X86-SKX-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-SKX-NEXT:    vpextrd $1, %xmm2, %eax
 ; X86-SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; X86-SKX-NEXT:    vmovaps %xmm1, %xmm0
 ; X86-SKX-NEXT:    retl
@@ -3535,8 +3534,8 @@ define <16 x float> @sext_i8_index(ptr %base, <16 x i8> %ind) {
 ;
 ; X86-LABEL: sext_i8_index:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovsxbd %xmm0, %zmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kxnorw %k0, %k0, %k1
 ; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3562,11 +3561,11 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
 ;
 ; X86-KNL-LABEL: sext_v8i8_index:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %ymm1
+; X86-KNL-NEXT:    movw $255, %ax
+; X86-KNL-NEXT:    kmovw %eax, %k1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X86-KNL-NEXT:    movw $255, %cx
-; X86-KNL-NEXT:    kmovw %ecx, %k1
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-KNL-NEXT:    retl
@@ -3581,8 +3580,8 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
 ;
 ; X86-SKX-LABEL: sext_v8i8_index:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %ymm1
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    kxnorb %k0, %k0, %k1
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
@@ -3607,8 +3606,8 @@ define <16 x float> @zext_i8_index(ptr %base, <16 x i8> %ind) {
 ;
 ; X86-LABEL: zext_i8_index:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kxnorw %k0, %k0, %k1
 ; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3634,11 +3633,11 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
 ;
 ; X86-KNL-LABEL: zext_v8i8_index:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X86-KNL-NEXT:    movw $255, %ax
+; X86-KNL-NEXT:    kmovw %eax, %k1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; X86-KNL-NEXT:    movw $255, %cx
-; X86-KNL-NEXT:    kmovw %ecx, %k1
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-KNL-NEXT:    retl
@@ -3653,8 +3652,8 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
 ;
 ; X86-SKX-LABEL: zext_v8i8_index:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    kxnorb %k0, %k0, %k1
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
@@ -3699,11 +3698,11 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
 ;
 ; X86-KNL-LABEL: test_scatter_2i32_index:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    vpslld $3, %xmm1, %xmm1
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm3
+; X86-KNL-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; X86-KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; X86-KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
-; X86-KNL-NEXT:    vpslld $3, %xmm1, %xmm1
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; X86-KNL-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB52_1
@@ -3752,9 +3751,9 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
 ;
 ; X86-SKX-LABEL: test_scatter_2i32_index:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; X86-SKX-NEXT:    vpslld $3, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
+; X86-SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; X86-SKX-NEXT:    vmovmskpd %xmm2, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB52_1
@@ -3789,8 +3788,8 @@ define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) {
 ;
 ; X86-KNL-LABEL: zext_index:
 ; X86-KNL:       # %bb.0:
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    kxnorw %k0, %k0, %k1
 ; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3815,8 +3814,8 @@ define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) {
 ;
 ; X86-SKX-LABEL: zext_index:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; X86-SKX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -3851,14 +3850,13 @@ define <16 x double> @test_gather_setcc_split(ptr %base, <16 x i32> %ind, <16 x
 ; X86-KNL-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-KNL-NEXT:    andl $-64, %esp
 ; X86-KNL-NEXT:    subl $64, %esp
-; X86-KNL-NEXT:    vmovdqa64 %zmm1, %zmm3
-; X86-KNL-NEXT:    vmovapd 72(%ebp), %zmm1
+; X86-KNL-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; X86-KNL-NEXT:    movl 8(%ebp), %eax
-; X86-KNL-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; X86-KNL-NEXT:    vptestnmd %zmm4, %zmm4, %k1
-; X86-KNL-NEXT:    vptestnmd %zmm3, %zmm3, %k2
-; X86-KNL-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
+; X86-KNL-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
+; X86-KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; X86-KNL-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; X86-KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; X86-KNL-NEXT:    vmovapd 72(%ebp), %zmm1
 ; X86-KNL-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovapd %zmm2, %zmm0
 ; X86-KNL-NEXT:    movl %ebp, %esp
@@ -3887,16 +3885,15 @@ define <16 x double> @test_gather_setcc_split(ptr %base, <16 x i32> %ind, <16 x
 ; X86-SKX-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SKX-NEXT:    andl $-64, %esp
 ; X86-SKX-NEXT:    subl $64, %esp
-; X86-SKX-NEXT:    vmovapd 72(%ebp), %zmm3
+; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; X86-SKX-NEXT:    movl 8(%ebp), %eax
-; X86-SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X86-SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
-; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
-; X86-SKX-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
+; X86-SKX-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
 ; X86-SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; X86-SKX-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
+; X86-SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
+; X86-SKX-NEXT:    vmovapd 72(%ebp), %zmm1
+; X86-SKX-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1}
 ; X86-SKX-NEXT:    vmovapd %zmm2, %zmm0
-; X86-SKX-NEXT:    vmovapd %zmm3, %zmm1
 ; X86-SKX-NEXT:    movl %ebp, %esp
 ; X86-SKX-NEXT:    popl %ebp
 ; X86-SKX-NEXT:    .cfi_def_cfa %esp, 4
@@ -3930,14 +3927,14 @@ define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cm
 ; X86-KNL-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-KNL-NEXT:    andl $-64, %esp
 ; X86-KNL-NEXT:    subl $64, %esp
-; X86-KNL-NEXT:    vmovapd 72(%ebp), %zmm3
+; X86-KNL-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; X86-KNL-NEXT:    movl 8(%ebp), %eax
-; X86-KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X86-KNL-NEXT:    vptestnmd %zmm4, %zmm4, %k1
-; X86-KNL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
-; X86-KNL-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
+; X86-KNL-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
+; X86-KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; X86-KNL-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 ; X86-KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; X86-KNL-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
+; X86-KNL-NEXT:    vmovapd 72(%ebp), %zmm1
+; X86-KNL-NEXT:    vscatterdpd %zmm1, (%eax,%ymm0,8) {%k1}
 ; X86-KNL-NEXT:    movl %ebp, %esp
 ; X86-KNL-NEXT:    popl %ebp
 ; X86-KNL-NEXT:    .cfi_def_cfa %esp, 4
@@ -3964,14 +3961,14 @@ define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cm
 ; X86-SKX-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SKX-NEXT:    andl $-64, %esp
 ; X86-SKX-NEXT:    subl $64, %esp
-; X86-SKX-NEXT:    vmovapd 72(%ebp), %zmm3
+; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
 ; X86-SKX-NEXT:    movl 8(%ebp), %eax
-; X86-SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; X86-SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
-; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
-; X86-SKX-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
+; X86-SKX-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
 ; X86-SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; X86-SKX-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
+; X86-SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; X86-SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
+; X86-SKX-NEXT:    vmovapd 72(%ebp), %zmm1
+; X86-SKX-NEXT:    vscatterdpd %zmm1, (%eax,%ymm0,8) {%k1}
 ; X86-SKX-NEXT:    movl %ebp, %esp
 ; X86-SKX-NEXT:    popl %ebp
 ; X86-SKX-NEXT:    .cfi_def_cfa %esp, 4
@@ -3999,8 +3996,8 @@ define <16 x float> @test_sext_cse(ptr %base, <16 x i32> %ind, ptr %foo) {
 ; X86-LABEL: test_sext_cse:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps %zmm0, (%ecx)
+; X86-NEXT:    vmovaps %zmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kxnorw %k0, %k0, %k1
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -4061,9 +4058,9 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB58_1
@@ -4138,11 +4135,10 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
 ;
 ; X86-SKX-LABEL: gather_2i64_constant_indices:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; X86-SKX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
-; X86-SKX-NEXT:    vpmovq2m %xmm0, %k0
-; X86-SKX-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
-; X86-SKX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm0, %eax
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB58_1
@@ -4182,8 +4178,8 @@ define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
 ; X86-KNL-NEXT:    retl
@@ -4214,8 +4210,8 @@ define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
 ; X86-SKX-NEXT:    retl
@@ -4256,9 +4252,9 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-KNL-NEXT:    kmovw %k0, %eax
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB60_1
 ; X86-KNL-NEXT:  # %bb.2: # %else
@@ -4329,11 +4325,10 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
 ;
 ; X86-SKX-LABEL: scatter_2i64_constant_indices:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm2
+; X86-SKX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm2
 ; X86-SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
-; X86-SKX-NEXT:    vpmovq2m %xmm0, %k0
-; X86-SKX-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
-; X86-SKX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-SKX-NEXT:    kmovw %k0, %eax
+; X86-SKX-NEXT:    vmovmskpd %xmm0, %eax
 ; X86-SKX-NEXT:    testb $1, %al
 ; X86-SKX-NEXT:    jne .LBB60_1
 ; X86-SKX-NEXT:  # %bb.2: # %else
@@ -4342,12 +4337,12 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
 ; X86-SKX-NEXT:  .LBB60_4: # %else2
 ; X86-SKX-NEXT:    retl
 ; X86-SKX-NEXT:  .LBB60_1: # %cond.store
-; X86-SKX-NEXT:    vmovd %xmm0, %ecx
+; X86-SKX-NEXT:    vmovd %xmm2, %ecx
 ; X86-SKX-NEXT:    vmovss %xmm1, (%ecx)
 ; X86-SKX-NEXT:    testb $2, %al
 ; X86-SKX-NEXT:    je .LBB60_4
 ; X86-SKX-NEXT:  .LBB60_3: # %cond.store1
-; X86-SKX-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-SKX-NEXT:    vpextrd $1, %xmm2, %eax
 ; X86-SKX-NEXT:    vextractps $1, %xmm1, (%eax)
 ; X86-SKX-NEXT:    retl
   %gep = getelementptr i32, ptr %ptr, <2 x i64> <i64 0, i64 -2>
@@ -4371,8 +4366,8 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
@@ -4403,8 +4398,8 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
 ; X86-SKX-NEXT:    vzeroupper
 ; X86-SKX-NEXT:    retl
@@ -4459,8 +4454,8 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB62_1
 ; X86-KNL-NEXT:  # %bb.2: # %else
@@ -4511,8 +4506,8 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpmovd2m %xmm0, %k1
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
 ; X86-SKX-NEXT:    vmovdqa %xmm1, %xmm0
 ; X86-SKX-NEXT:    retl
@@ -4567,8 +4562,8 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    kmovw %k0, %eax
+; X86-KNL-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
 ; X86-KNL-NEXT:    testb $1, %al
 ; X86-KNL-NEXT:    jne .LBB63_1
 ; X86-KNL-NEXT:  # %bb.2: # %else
@@ -4616,8 +4611,8 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
 ; X86-SKX:       # %bb.0:
 ; X86-SKX-NEXT:    vpslld $31, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpmovd2m %xmm0, %k1
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
 ; X86-SKX-NEXT:    retl
   %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0
@@ -4662,8 +4657,8 @@ define <8 x float> @scaleidx_x86gather_outofrange(ptr %base, <8 x i32> %index, <
 ;
 ; X86-LABEL: scaleidx_x86gather_outofrange:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpslld $2, %ymm0, %ymm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vgatherdps %ymm1, (%eax,%ymm2,4), %ymm0
 ; X86-NEXT:    retl
@@ -4685,8 +4680,8 @@ define void @scaleidx_x86scatter(<16 x float> %value, ptr %base, <16 x i32> %ind
 ;
 ; X86-LABEL: scaleidx_x86scatter:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4712,12 +4707,12 @@ define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i
 ; X86-KNL-LABEL: scaleidx_scatter:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpaddd %ymm1, %ymm1, %ymm1
-; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    kmovw %ecx, %k0
+; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    kmovw %eax, %k0
 ; X86-KNL-NEXT:    kshiftlw $8, %k0, %k0
 ; X86-KNL-NEXT:    kshiftrw $8, %k0, %k1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
@@ -4732,8 +4727,8 @@ define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i
 ;
 ; X86-SKX-LABEL: scaleidx_scatter:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vscatterdps %ymm0, (%eax,%ymm1,8) {%k1}
 ; X86-SKX-NEXT:    vzeroupper
 ; X86-SKX-NEXT:    retl
@@ -4759,12 +4754,12 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32
 ; X86-KNL-LABEL: scaleidx_scatter_outofrange:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vpslld $2, %ymm1, %ymm1
-; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    kmovw %ecx, %k0
+; X86-KNL-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    kmovw %eax, %k0
 ; X86-KNL-NEXT:    kshiftlw $8, %k0, %k0
 ; X86-KNL-NEXT:    kshiftrw $8, %k0, %k1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-KNL-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
 ; X86-KNL-NEXT:    vzeroupper
 ; X86-KNL-NEXT:    retl
@@ -4779,9 +4774,9 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32
 ;
 ; X86-SKX-LABEL: scaleidx_scatter_outofrange:
 ; X86-SKX:       # %bb.0:
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vpslld $2, %ymm1, %ymm1
 ; X86-SKX-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vscatterdps %ymm0, (%eax,%ymm1,4) {%k1}
 ; X86-SKX-NEXT:    vzeroupper
 ; X86-SKX-NEXT:    retl
@@ -4932,13 +4927,13 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vpslld $4, (%ecx), %zmm0
-; X86-KNL-NEXT:    vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -4967,13 +4962,13 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vpslld $4, (%ecx), %zmm0
-; X86-SKX-NEXT:    vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps (%eax,%zmm2), %zmm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -4998,13 +4993,13 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vpslld $4, (%ecx), %zmm0
-; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -5033,13 +5028,13 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vpslld $4, (%ecx), %zmm0
-; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -5066,14 +5061,14 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ;
 ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_pair:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vpslld $4, (%ecx), %zmm2
-; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
+; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2), %zmm0 {%k2}
 ; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    retl
@@ -5107,14 +5102,14 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
 ;
 ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vpslld $4, (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vpslld $4, (%ecx), %zmm2
-; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
+; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vgatherdps (%eax,%zmm2), %zmm0 {%k2}
 ; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-SKX-NEXT:    retl
@@ -5146,14 +5141,14 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
 ; X86-KNL-LABEL: test_gather_structpt2_8f32_mask_index:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovdqu (%eax), %ymm2
+; X86-KNL-NEXT:    vpslld $3, %ymm2, %ymm2
 ; X86-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovdqu (%ecx), %ymm0
-; X86-KNL-NEXT:    vpslld $3, %ymm0, %ymm0
-; X86-KNL-NEXT:    vgatherdps (%eax,%zmm0), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -5182,13 +5177,13 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index(ptr %x, ptr %arr, <8 x
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_8f32_mask_index:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %ymm2
 ; X86-SKX-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X86-SKX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-SKX-NEXT:    vpmovd2m %ymm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %ymm0
-; X86-SKX-NEXT:    vgatherdps (%eax,%ymm0,8), %ymm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps (%eax,%ymm2,8), %ymm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %ymm1, %ymm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <8 x i32>, ptr %arr, align 4
@@ -5215,14 +5210,14 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
 ; X86-KNL-LABEL: test_gather_structpt2_8f32_mask_index_offset:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovdqu (%eax), %ymm2
+; X86-KNL-NEXT:    vpslld $3, %ymm2, %ymm2
 ; X86-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovdqu (%ecx), %ymm0
-; X86-KNL-NEXT:    vpslld $3, %ymm0, %ymm0
-; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm0), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %ymm1, %ymm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -5251,13 +5246,13 @@ define <8 x float> @test_gather_structpt2_8f32_mask_index_offset(ptr %x, ptr %ar
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_8f32_mask_index_offset:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %ymm2
 ; X86-SKX-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X86-SKX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-SKX-NEXT:    vpmovd2m %ymm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %ymm0
-; X86-SKX-NEXT:    vgatherdps 4(%eax,%ymm0,8), %ymm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps 4(%eax,%ymm2,8), %ymm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %ymm1, %ymm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <8 x i32>, ptr %arr, align 4
@@ -5288,15 +5283,15 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
 ; X86-KNL-LABEL: test_gather_structpt2_8f32_mask_index_pair:
 ; X86-KNL:       # %bb.0:
 ; X86-KNL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovdqu (%eax), %ymm2
+; X86-KNL-NEXT:    vpslld $3, %ymm2, %ymm2
 ; X86-KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovdqu (%ecx), %ymm0
-; X86-KNL-NEXT:    vpslld $3, %ymm0, %ymm2
-; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    kmovw %k1, %k2
+; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2), %zmm0 {%k2}
 ; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
 ; X86-KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -5332,14 +5327,14 @@ define {<8 x float>, <8 x float>} @test_gather_structpt2_8f32_mask_index_pair(pt
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_8f32_mask_index_pair:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %ymm2
 ; X86-SKX-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X86-SKX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; X86-SKX-NEXT:    vpmovd2m %ymm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %ymm2
-; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vmovaps %ymm1, %ymm0
+; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vgatherdps (%eax,%ymm2,8), %ymm0 {%k2}
 ; X86-SKX-NEXT:    vgatherdps 4(%eax,%ymm2,8), %ymm1 {%k1}
 ; X86-SKX-NEXT:    retl
@@ -5369,13 +5364,13 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovups (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovups (%ecx), %zmm0
-; X86-KNL-NEXT:    vgatherdps (%eax,%zmm0,8), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -5404,13 +5399,13 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index(ptr %x, ptr %arr, <1
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %zmm0
-; X86-SKX-NEXT:    vgatherdps (%eax,%zmm0,8), %zmm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps (%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -5435,13 +5430,13 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_offset:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovups (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovups (%ecx), %zmm0
-; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm0,8), %zmm1 {%k1}
+; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-KNL-NEXT:    retl
 ;
@@ -5470,13 +5465,13 @@ define <16 x float> @test_gather_structpt2_16f32_mask_index_offset(ptr %x, ptr %
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_offset:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %zmm0
-; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm0,8), %zmm1 {%k1}
+; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
 ; X86-SKX-NEXT:    retl
   %wide.load = load <16 x i32>, ptr %arr, align 4
@@ -5503,14 +5498,14 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
 ;
 ; X86-KNL-LABEL: test_gather_structpt2_16f32_mask_index_pair:
 ; X86-KNL:       # %bb.0:
+; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-KNL-NEXT:    vmovups (%eax), %zmm2
 ; X86-KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-KNL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-KNL-NEXT:    vmovups (%ecx), %zmm2
-; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vmovaps %zmm1, %zmm0
+; X86-KNL-NEXT:    kmovw %k1, %k2
 ; X86-KNL-NEXT:    vgatherdps (%eax,%zmm2,8), %zmm0 {%k2}
 ; X86-KNL-NEXT:    vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-KNL-NEXT:    retl
@@ -5544,14 +5539,14 @@ define {<16 x float>, <16 x float>} @test_gather_structpt2_16f32_mask_index_pair
 ;
 ; X86-SKX-LABEL: test_gather_structpt2_16f32_mask_index_pair:
 ; X86-SKX:       # %bb.0:
+; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SKX-NEXT:    vmovups (%eax), %zmm2
 ; X86-SKX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; X86-SKX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; X86-SKX-NEXT:    vpmovd2m %zmm0, %k1
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SKX-NEXT:    vmovups (%ecx), %zmm2
-; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vmovaps %zmm1, %zmm0
+; X86-SKX-NEXT:    kmovw %k1, %k2
 ; X86-SKX-NEXT:    vgatherdps (%eax,%zmm2,8), %zmm0 {%k2}
 ; X86-SKX-NEXT:    vgatherdps 4(%eax,%zmm2,8), %zmm1 {%k1}
 ; X86-SKX-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 99a8918fef93f..709f30dc71bf2 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -160,8 +160,8 @@ define <2 x double> @load_v2f64_i2(i2 %trigger, ptr %addr, <2 x double> %dst) {
 ;
 ; X86-AVX512-LABEL: load_v2f64_i2:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd (%eax), %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i2 %trigger to <2 x i1>
@@ -244,8 +244,8 @@ define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double
 ;
 ; X86-AVX512-LABEL: load_v2f64_v2i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmpd (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
@@ -335,8 +335,8 @@ define <4 x double> @load_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %dst) {
 ;
 ; X86-AVX512-LABEL: load_v4f64_i4:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovapd (%eax), %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i4 %trigger to <4 x i1>
@@ -420,8 +420,8 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, ptr %addr, <4 x double
 ;
 ; X86-AVX512-LABEL: load_v4f64_v4i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmpd (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -502,8 +502,8 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, ptr %addr) {
 ;
 ; X86-AVX512-LABEL: load_v4f64_v4i32_zero:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovapd (%eax), %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -629,8 +629,8 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double
 ;
 ; X86-AVX512-LABEL: load_v4f64_v4i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmpd (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i64> %trigger, zeroinitializer
@@ -947,8 +947,8 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x double
 ;
 ; X86-AVX512-LABEL: load_v8f64_v8i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmpd (%eax), %zmm1, %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i16> %trigger, zeroinitializer
@@ -1147,8 +1147,8 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x double
 ;
 ; X86-AVX512-LABEL: load_v8f64_v8i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmpd (%eax), %zmm1, %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i64> %trigger, zeroinitializer
@@ -1243,10 +1243,10 @@ define <2 x float> @load_v2f32_i2(i2 %trigger, ptr %addr, <2 x float> %dst) {
 ;
 ; X86-AVX512-LABEL: load_v2f32_i2:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i2 %trigger to <2 x i1>
@@ -1345,10 +1345,10 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float>
 ;
 ; X86-AVX512-LABEL: load_v2f32_v2i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmps (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1441,10 +1441,10 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, ptr %addr) {
 ;
 ; X86-AVX512-LABEL: load_v2f32_v2i32_undef:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -1571,8 +1571,8 @@ define <4 x float> @load_v4f32_i4(i4 %trigger, ptr %addr, <4 x float> %dst) {
 ;
 ; X86-AVX512-LABEL: load_v4f32_i4:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i4 %trigger to <4 x i1>
@@ -1690,8 +1690,8 @@ define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x float>
 ;
 ; X86-AVX512-LABEL: load_v4f32_v4i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmps (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -2279,8 +2279,8 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float>
 ;
 ; X86-AVX512-LABEL: load_v8f32_v8i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vblendmps (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
@@ -2418,8 +2418,8 @@ define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %dst
 ;
 ; X86-AVX512-LABEL: load_v2i64_v2i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmq (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
@@ -2547,8 +2547,8 @@ define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %dst
 ;
 ; X86-AVX512-LABEL: load_v4i64_v4i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmq (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i64> %trigger, zeroinitializer
@@ -2755,8 +2755,8 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i64> %dst
 ;
 ; X86-AVX512-LABEL: load_v8i64_v8i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmq (%eax), %zmm1, %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i16> %trigger, zeroinitializer
@@ -2959,8 +2959,8 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, ptr %addr, <8 x i64> %dst
 ;
 ; X86-AVX512-LABEL: load_v8i64_v8i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmq (%eax), %zmm1, %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i64> %trigger, zeroinitializer
@@ -3071,10 +3071,10 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst
 ;
 ; X86-AVX512-LABEL: load_v2i32_v2i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmd (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -3199,8 +3199,8 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst
 ;
 ; X86-AVX512-LABEL: load_v4i32_v4i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmd (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -3878,8 +3878,8 @@ define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %dst
 ;
 ; X86-AVX512-LABEL: load_v8i16_v8i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmw (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <8 x i16> %trigger, zeroinitializer
@@ -4595,8 +4595,8 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16>
 ;
 ; X86-AVX512-LABEL: load_v16i16_v16i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovw2m %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmw (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <16 x i16> %trigger, zeroinitializer
@@ -5300,8 +5300,8 @@ define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %dst
 ;
 ; X86-AVX512-LABEL: load_v16i8_v16i8:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovb2m %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmb (%eax), %xmm1, %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <16 x i8> %trigger, zeroinitializer
@@ -7086,8 +7086,8 @@ define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %dst
 ;
 ; X86-AVX512-LABEL: load_v32i8_v32i8:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovb2m %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpblendmb (%eax), %ymm1, %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <32 x i8> %trigger, zeroinitializer
@@ -7149,9 +7149,9 @@ define <4 x float> @mload_constmask_v4f32(ptr %addr, <4 x float> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4f32:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $13, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $13, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
@@ -7259,9 +7259,9 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4i32:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $14, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $14, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovdqu32 (%eax), %xmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
@@ -7348,9 +7348,9 @@ define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v8f32:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $7, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $7, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
@@ -7403,9 +7403,9 @@ define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $7, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $7, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovups (%eax), %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
   %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
@@ -7451,9 +7451,9 @@ define <4 x double> @mload_constmask_v4f64(ptr %addr, <4 x double> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4f64:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $7, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $7, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
@@ -7515,9 +7515,9 @@ define <8 x i32> @mload_constmask_v8i32(ptr %addr, <8 x i32> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v8i32:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $-121, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $-121, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovdqu32 (%eax), %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
@@ -7568,9 +7568,9 @@ define <4 x i64> @mload_constmask_v4i64(ptr %addr, <4 x i64> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4i64:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $9, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $9, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
@@ -7616,9 +7616,9 @@ define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v8f64:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $-121, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $-121, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovupd (%eax), %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
@@ -7687,9 +7687,9 @@ define <16 x double> @mload_constmask_v16f64_allones_split(ptr %addr, <16 x doub
 ;
 ; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $85, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $85, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovupd 64(%eax), %zmm1 {%k1}
 ; X86-AVX512-NEXT:    vmovups (%eax), %zmm0
 ; X86-AVX512-NEXT:    retl
@@ -7736,9 +7736,9 @@ define <4 x double> @mload_constmask_v4f64_undef_passthrough(ptr %addr) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4f64_undef_passthrough:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $7, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $7, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovupd (%eax), %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
@@ -7789,9 +7789,9 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(ptr %addr) {
 ;
 ; X86-AVX512-LABEL: mload_constmask_v4i64_undef_passthrough:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $6, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $6, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovdqu64 (%eax), %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
@@ -7945,9 +7945,9 @@ define <8 x double> @load_one_mask_bit_set5(ptr %addr, <8 x double> %val) {
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set5:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $-128, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $-128, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vbroadcastsd 56(%eax), %zmm0 {%k1}
 ; X86-AVX512-NEXT:    retl
   %res = call <8 x double> @llvm.masked.load.v8f64.p0(ptr %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
@@ -8044,9 +8044,9 @@ define <16 x i64> @load_one_mask_bit_set6(ptr %addr, <16 x i64> %val) {
 ;
 ; X86-AVX512-LABEL: load_one_mask_bit_set6:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $4, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $4, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vbroadcastsd 16(%eax), %zmm0 {%k1}
 ; X86-AVX512-NEXT:    movb $36, %cl
 ; X86-AVX512-NEXT:    kmovd %ecx, %k1
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index bdbb912a71d36..0e9016d49ba65 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -142,8 +142,8 @@ define void @store_v2f64_i2(i2 %trigger, ptr %addr, <2 x double> %val) nounwind
 ;
 ; X86-AVX512-LABEL: store_v2f64_i2:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i2 %trigger to <2 x i1>
@@ -202,8 +202,8 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x double> %val)
 ;
 ; X86-AVX512-LABEL: store_v2f64_v2i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovq2m %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <2 x i64> %trigger, zeroinitializer
@@ -295,8 +295,8 @@ define void @store_v4f64_i4(i4 %trigger, ptr %addr, <4 x double> %val) nounwind
 ;
 ; X86-AVX512-LABEL: store_v4f64_i4:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd %ymm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -411,8 +411,8 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x double> %val)
 ;
 ; X86-AVX512-LABEL: store_v4f64_v4i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovq2m %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd %ymm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -502,10 +502,10 @@ define void @store_v2f32_i2(i2 %trigger, ptr %addr, <2 x float> %val) nounwind {
 ;
 ; X86-AVX512-LABEL: store_v2f32_i2:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i2 %trigger to <2 x i1>
@@ -594,10 +594,10 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float> %val)
 ;
 ; X86-AVX512-LABEL: store_v2f32_v2i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -717,8 +717,8 @@ define void @store_v4f32_i4(<4 x float> %x, ptr %ptr, <4 x float> %y, i4 %trigge
 ;
 ; X86-AVX512-LABEL: store_v4f32_i4:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = bitcast i4 %trigger to <4 x i1>
@@ -828,8 +828,8 @@ define void @store_v4f32_v4i32(<4 x float> %x, ptr %ptr, <4 x float> %y, <4 x i3
 ;
 ; X86-AVX512-LABEL: store_v4f32_v4i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
@@ -1015,8 +1015,8 @@ define void @store_v8f32_i8(<8 x float> %x, ptr %ptr, <8 x float> %y, i8 %trigge
 ;
 ; X86-AVX512-LABEL: store_v8f32_i8:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -1195,8 +1195,8 @@ define void @store_v8f32_v8i32(<8 x float> %x, ptr %ptr, <8 x float> %y, <8 x i3
 ;
 ; X86-AVX512-LABEL: store_v8f32_v8i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovd2m %ymm2, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -1658,8 +1658,8 @@ define void @store_v16f32_i16(<16 x float> %x, ptr %ptr, <16 x float> %y, i16 %t
 ;
 ; X86-AVX512-LABEL: store_v16f32_i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %zmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -1963,8 +1963,8 @@ define void @store_v16f32_v16i32(<16 x float> %x, ptr %ptr, <16 x float> %y, <16
 ;
 ; X86-AVX512-LABEL: store_v16f32_v16i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovd2m %zmm2, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %zmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -2052,8 +2052,8 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, ptr %addr, <2 x i64> %val) no
 ;
 ; X86-AVX512-LABEL: store_v2i64_v2i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovq2m %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu64 %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp slt <2 x i64> %trigger, zeroinitializer
@@ -2175,8 +2175,8 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, ptr %addr, <4 x i64> %val) no
 ;
 ; X86-AVX512-LABEL: store_v4i64_v4i64:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovq2m %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu64 %ymm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -2312,10 +2312,10 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) no
 ;
 ; X86-AVX512-LABEL: store_v2i32_v2i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k0
 ; X86-AVX512-NEXT:    kshiftlb $6, %k0, %k0
 ; X86-AVX512-NEXT:    kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu32 %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
@@ -2428,8 +2428,8 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) no
 ;
 ; X86-AVX512-LABEL: store_v4i32_v4i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu32 %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
@@ -2613,8 +2613,8 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) no
 ;
 ; X86-AVX512-LABEL: store_v8i32_v8i32:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu32 %ymm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -2964,8 +2964,8 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, ptr %addr, <8 x i16> %val) no
 ;
 ; X86-AVX512-LABEL: store_v8i16_v8i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu16 %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <8 x i16> %trigger, zeroinitializer
@@ -3728,8 +3728,8 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, ptr %addr, <16 x i16> %val
 ;
 ; X86-AVX512-LABEL: store_v16i16_v16i16:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmw %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu16 %ymm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -4330,8 +4330,8 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, ptr %addr, <16 x i8> %val) no
 ;
 ; X86-AVX512-LABEL: store_v16i8_v16i8:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmb %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu8 %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask = icmp eq <16 x i8> %trigger, zeroinitializer
@@ -5724,8 +5724,8 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, ptr %addr, <32 x i8> %val) no
 ;
 ; X86-AVX512-LABEL: store_v32i8_v32i8:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestnmb %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu8 %ymm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -5862,9 +5862,9 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16
 ;
 ; X86-AVX512-LABEL: mstore_constmask_allones_split:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movb $-37, %al
+; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movb $-37, %cl
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512-NEXT:    vmovdqu64 %zmm2, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vmovups %zmm3, 64(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
@@ -5941,8 +5941,8 @@ define void @one_mask_bit_set3(ptr %addr, <4 x i64> %val) nounwind {
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set3:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovlps %xmm0, 16(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -5967,8 +5967,8 @@ define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind {
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set4:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovhps %xmm0, 24(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -6000,8 +6000,8 @@ define void @one_mask_bit_set5(ptr %addr, <8 x double> %val) nounwind {
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set5:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovlps %xmm0, 48(%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -6053,8 +6053,8 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind {
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set6:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovlps %xmm0, 48(%eax)
 ; X86-AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -6113,9 +6113,9 @@ define void @top_bits_unset_stack() nounwind {
 ; X86-AVX512-LABEL: top_bits_unset_stack:
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    subl $76, %esp
-; X86-AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    movb $63, %al
 ; X86-AVX512-NEXT:    kmovd %eax, %k1
+; X86-AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vmovupd %zmm0, (%esp) {%k1}
 ; X86-AVX512-NEXT:    addl $76, %esp
 ; X86-AVX512-NEXT:    vzeroupper
@@ -6211,9 +6211,9 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, ptr %p, <
 ;
 ; X86-AVX512-LABEL: masked_store_bool_mask_demand_trunc_sext:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vpmovd2m %xmm1, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovupd %ymm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -6319,8 +6319,8 @@ define void @one_mask_bit_set1_variable(ptr %addr, <4 x float> %val, <4 x i32> %
 ;
 ; X86-AVX512-LABEL: one_mask_bit_set1_variable:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vptestmd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %mask_signbit = and <4 x i32> %mask, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
@@ -6692,8 +6692,8 @@ define void @PR11210(<4 x float> %x, ptr %ptr, <4 x float> %y, <2 x i64> %mask)
 ;
 ; X86-AVX512-LABEL: PR11210:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpmovd2m %xmm2, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovups %xmm1, (%eax) {%k1}
 ; X86-AVX512-NEXT:    retl
   %bc = bitcast <2 x i64> %mask to <4 x i32>
@@ -7065,18 +7065,18 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
 ;
 ; X86-AVX512-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movw $21845, %ax ## imm = 0x5555
+; X86-AVX512-NEXT:    kmovd %eax, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpcmpgtd (%eax), %zmm0, %k1 {%k1}
+; X86-AVX512-NEXT:    movw $85, %cx
+; X86-AVX512-NEXT:    kmovd %ecx, %k2
+; X86-AVX512-NEXT:    vpcmpgtd 64(%eax), %zmm0, %k2 {%k2}
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovdqa64 (%eax), %zmm0
+; X86-AVX512-NEXT:    vmovdqa64 64(%eax), %zmm1
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovdqa64 (%edx), %zmm0
-; X86-AVX512-NEXT:    vmovdqa64 64(%edx), %zmm1
-; X86-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-AVX512-NEXT:    movw $21845, %dx ## imm = 0x5555
-; X86-AVX512-NEXT:    kmovd %edx, %k1
-; X86-AVX512-NEXT:    vpcmpgtd (%ecx), %zmm2, %k1 {%k1}
-; X86-AVX512-NEXT:    movw $85, %dx
-; X86-AVX512-NEXT:    kmovd %edx, %k2
-; X86-AVX512-NEXT:    vpcmpgtd 64(%ecx), %zmm2, %k2 {%k2}
 ; X86-AVX512-NEXT:    vmovdqu32 %zmm1, 64(%eax) {%k2}
 ; X86-AVX512-NEXT:    vmovdqu32 %zmm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
@@ -7281,13 +7281,13 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind {
 ;
 ; X86-AVX512-LABEL: undefshuffle:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl $15, %eax
+; X86-AVX512-NEXT:    kmovd %eax, %k0
 ; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k0
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl $15, %ecx
-; X86-AVX512-NEXT:    kmovd %ecx, %k1
-; X86-AVX512-NEXT:    kandd %k1, %k0, %k1
+; X86-AVX512-NEXT:    vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT:    kandd %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqu32 %ymm0, (%eax) {%k1}
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll
index 045e8daa7597b..a917d148ec73e 100644
--- a/llvm/test/CodeGen/X86/materialize.ll
+++ b/llvm/test/CodeGen/X86/materialize.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
 ; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
@@ -10,74 +11,155 @@
 ; RUN:    | FileCheck %s --check-prefix=OPERAND64
 
 define i32 @one32_nooptsize() {
+; CHECK32-LABEL: one32_nooptsize:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl $1, %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one32_nooptsize:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $1, %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one32_nooptsize:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 1
 
 ; When not optimizing for size, use mov.
-; CHECK32-LABEL: one32_nooptsize:
-; CHECK32:       movl $1, %eax
-; CHECK32-NEXT:  retl
-; CHECK64-LABEL: one32_nooptsize:
-; CHECK64:       movl $1, %eax
-; CHECK64-NEXT:  retq
 }
 
 define i32 @one32() optsize {
+; CHECK32-LABEL: one32:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one32:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $1, %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one32:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 1
 
-; CHECK32-LABEL: one32:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  retl
 
 ; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32:
-; CHECK64:       movl $1, %eax
-; CHECK64-NEXT:  retq
 }
 
 define i32 @one32_pgso() !prof !14 {
+; CHECK32-LABEL: one32_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one32_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $1, %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one32_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 1
 
-; CHECK32-LABEL: one32_pgso:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  retl
 
 ; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32_pgso:
-; CHECK64:       movl $1, %eax
-; CHECK64-NEXT:  retq
 }
 
 define i32 @one32_minsize() minsize {
+; CHECK32-LABEL: one32_minsize:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one32_minsize:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq $1
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK64-NEXT:    popq %rax
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one32_minsize:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 1
 
 ; On 32-bit, xor-inc is preferred over push-pop.
-; CHECK32-LABEL: one32_minsize:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  retl
 
 ; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
 ; pop into a 64-bit register even when we just need 32 bits.
-; CHECK64-LABEL: one32_minsize:
-; CHECK64:       pushq $1
-; CHECK64:       .cfi_adjust_cfa_offset 8
-; CHECK64:       popq %rax
-; CHECK64:       .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT:  retq
 
 ; On Win64 we can't adjust the stack unless there's a frame pointer.
-; CHECKWIN64-LABEL: one32_minsize:
-; CHECKWIN64:       movl $1, %eax
-; CHECKWIN64-NEXT:  retq
 }
 
 define i32 @pr26023() minsize {
+; CHECK32-LABEL: pr26023:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    subl $124, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 128
+; CHECK32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl %eax, (%esp)
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movb $-2, 119(%eax)
+; CHECK32-NEXT:    pushl $5
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK32-NEXT:    popl %ecx
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movsbl 119(%eax), %eax
+; CHECK32-NEXT:    addl $124, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: pr26023:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; CHECK64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    movb $-2, 119(%rax)
+; CHECK64-NEXT:    movl $5, %ecx
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    movsbl 119(%rax), %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: pr26023:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    subq $128, %rsp
+; CHECKWIN64-NEXT:    .seh_stackalloc 128
+; CHECKWIN64-NEXT:    .seh_endprologue
+; CHECKWIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; CHECKWIN64-NEXT:    movq %rax, (%rsp)
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    movb $-2, 119(%rax)
+; CHECKWIN64-NEXT:    movl $5, %ecx
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    movsbl 119(%rax), %eax
+; CHECKWIN64-NEXT:    .seh_startepilogue
+; CHECKWIN64-NEXT:    addq $128, %rsp
+; CHECKWIN64-NEXT:    .seh_endepilogue
+; CHECKWIN64-NEXT:    retq
+; CHECKWIN64-NEXT:    .seh_endproc
 entry:
   %x = alloca [120 x i8]
   call void asm sideeffect "", "imr,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr %x)
@@ -89,146 +171,337 @@ entry:
   ret i32 %conv
 
 ; The function writes to the redzone, so push/pop cannot be used.
-; CHECK64-LABEL: pr26023:
-; CHECK64:       movl $5, %ecx
-; CHECK64:       retq
 
 ; 32-bit X86 doesn't have a redzone.
-; CHECK32-LABEL: pr26023:
-; CHECK32:       pushl $5
-; CHECK32:       popl %ecx
-; CHECK32:       retl
 
 ; Check push/pop have implicit def/use of $esp
-; OPERAND32:      PUSH32i 5, implicit-def $esp, implicit $esp
-; OPERAND32-NEXT: CFI_INSTRUCTION adjust_cfa_offset 4
-; OPERAND32-NEXT: renamable $ecx = POP32r implicit-def $esp, implicit $esp
-; OPERAND32-NEXT: CFI_INSTRUCTION adjust_cfa_offset -4
 }
 
 
 define i64 @one64_minsize() minsize {
+; CHECK32-LABEL: one64_minsize:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    xorl %edx, %edx
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one64_minsize:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq $1
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK64-NEXT:    popq %rax
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one64_minsize:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i64 1
 ; On 64-bit we don't do xor-inc yet, so push-pop it is.
-; CHECK64-LABEL: one64_minsize:
-; CHECK64:       pushq $1
-; CHECK64:       .cfi_adjust_cfa_offset 8
-; CHECK64:       popq %rax
-; CHECK64:       .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT:  retq
 
 ; On Win64 we can't adjust the stack unless there's a frame pointer.
-; CHECKWIN64-LABEL: one64_minsize:
-; CHECKWIN64:       movl $1, %eax
-; CHECKWIN64-NEXT:  retq
 
 ; Check push/pop have implicit def/use of $rsp
-; OPERAND64:      PUSH64i32 1, implicit-def $rsp, implicit $rsp
-; OPERAND64-NEXT: CFI_INSTRUCTION adjust_cfa_offset 8
-; OPERAND64-NEXT: $rax = POP64r implicit-def $rsp, implicit $rsp
-; OPERAND64-NEXT: CFI_INSTRUCTION adjust_cfa_offset -8
-; OPERAND64-NEXT: RET 0, $rax
 }
 
 define i32 @minus_one32() optsize {
+; CHECK32-LABEL: minus_one32:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one32:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $-1, %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_one32:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $-1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 -1
 
-; CHECK32-LABEL: minus_one32:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  retl
 }
 
 define i32 @minus_one32_pgso() !prof !14 {
+; CHECK32-LABEL: minus_one32_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one32_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $-1, %eax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_one32_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $-1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 -1
 
-; CHECK32-LABEL: minus_one32_pgso:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  retl
 }
 
 define i32 @minus_one32_minsize() minsize {
+; CHECK32-LABEL: minus_one32_minsize:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one32_minsize:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq $-1
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK64-NEXT:    popq %rax
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_one32_minsize:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $-1, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 -1
 
 ; xor-dec is preferred over push-pop.
-; CHECK32-LABEL: minus_one32_minsize:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  retl
 }
 
 define i16 @one16() optsize {
+; CHECK32-LABEL: one16:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one16:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movw $1, %ax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one16:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movw $1, %ax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i16 1
 
-; CHECK32-LABEL: one16:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  # kill
-; CHECK32-NEXT:  retl
 }
 
 define i16 @minus_one16() optsize {
+; CHECK32-LABEL: minus_one16:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one16:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movw $-1, %ax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_one16:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movw $-1, %ax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i16 -1
 
-; CHECK32-LABEL: minus_one16:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  # kill
-; CHECK32-NEXT:  retl
 }
 
 define i16 @one16_pgso() !prof !14 {
+; CHECK32-LABEL: one16_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    incl %eax
+; CHECK32-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: one16_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movw $1, %ax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: one16_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movw $1, %ax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i16 1
 
-; CHECK32-LABEL: one16_pgso:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  incl %eax
-; CHECK32-NEXT:  # kill
-; CHECK32-NEXT:  retl
 }
 
 define i16 @minus_one16_pgso() !prof !14 {
+; CHECK32-LABEL: minus_one16_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one16_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movw $-1, %ax
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_one16_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movw $-1, %ax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i16 -1
 
-; CHECK32-LABEL: minus_one16_pgso:
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NEXT:  # kill
-; CHECK32-NEXT:  retl
 }
 
 define i32 @minus_five32() minsize {
+; CHECK32-LABEL: minus_five32:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl $-5
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK32-NEXT:    popl %eax
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_five32:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq $-5
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK64-NEXT:    popq %rax
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_five32:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movl $-5, %eax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i32 -5
 
-; CHECK32-LABEL: minus_five32:
-; CHECK32: pushl $-5
-; CHECK32: popl %eax
-; CHECK32: retl
 }
 
 define i64 @minus_five64() minsize {
+; CHECK32-LABEL: minus_five64:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl $-5
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK32-NEXT:    popl %eax
+; CHECK32-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK32-NEXT:    xorl %edx, %edx
+; CHECK32-NEXT:    decl %edx
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_five64:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq $-5
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
+; CHECK64-NEXT:    popq %rax
+; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: minus_five64:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    movq $-5, %rax
+; CHECKWIN64-NEXT:    retq
 entry:
   ret i64 -5
 
-; CHECK64-LABEL: minus_five64:
-; CHECK64: pushq $-5
-; CHECK64:       .cfi_adjust_cfa_offset 8
-; CHECK64: popq %rax
-; CHECK64:       .cfi_adjust_cfa_offset -8
-; CHECK64: retq
 }
 
 define i32 @rematerialize_minus_one() optsize {
+; CHECK32-LABEL: rematerialize_minus_one:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    subl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 32
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    decl %ecx
+; CHECK32-NEXT:    calll f at PLT
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    addl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: rematerialize_minus_one:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    pushq %rax
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    .cfi_offset %rbx, -24
+; CHECK64-NEXT:    .cfi_offset %rbp, -16
+; CHECK64-NEXT:    movl $-1, %edi
+; CHECK64-NEXT:    callq f at PLT
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    movl $-1, %eax
+; CHECK64-NEXT:    addq $8, %rsp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: rematerialize_minus_one:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    pushq %rsi
+; CHECKWIN64-NEXT:    .seh_pushreg %rsi
+; CHECKWIN64-NEXT:    pushq %rdi
+; CHECKWIN64-NEXT:    .seh_pushreg %rdi
+; CHECKWIN64-NEXT:    pushq %rbp
+; CHECKWIN64-NEXT:    .seh_pushreg %rbp
+; CHECKWIN64-NEXT:    pushq %rbx
+; CHECKWIN64-NEXT:    .seh_pushreg %rbx
+; CHECKWIN64-NEXT:    subq $40, %rsp
+; CHECKWIN64-NEXT:    .seh_stackalloc 40
+; CHECKWIN64-NEXT:    .seh_endprologue
+; CHECKWIN64-NEXT:    movl $-1, %ecx
+; CHECKWIN64-NEXT:    callq f
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    movl $-1, %eax
+; CHECKWIN64-NEXT:    .seh_startepilogue
+; CHECKWIN64-NEXT:    addq $40, %rsp
+; CHECKWIN64-NEXT:    popq %rbx
+; CHECKWIN64-NEXT:    popq %rbp
+; CHECKWIN64-NEXT:    popq %rdi
+; CHECKWIN64-NEXT:    popq %rsi
+; CHECKWIN64-NEXT:    .seh_endepilogue
+; CHECKWIN64-NEXT:    retq
+; CHECKWIN64-NEXT:    .seh_endproc
 entry:
   ; Materialize -1 (thiscall forces it into %ecx).
   tail call x86_thiscallcc void @f(i32 -1)
@@ -240,17 +513,125 @@ entry:
   ; -1 should be re-materialized here instead of getting spilled above.
   ret i32 -1
 
-; CHECK32-LABEL: rematerialize_minus_one
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NOT:   %eax
-; CHECK32:       retl
 }
 
 define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
+; CHECK32-LABEL: rematerialize_minus_one_eflags:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    subl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 32
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    xorl %esi, %esi
+; CHECK32-NEXT:    decl %esi
+; CHECK32-NEXT:    movl %esi, %ecx
+; CHECK32-NEXT:    calll f at PLT
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    cmpl $123, {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    setne %al
+; CHECK32-NEXT:    cmovel %esi, %eax
+; CHECK32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK32-NEXT:    addl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: rematerialize_minus_one_eflags:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    pushq %r15
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    pushq %r14
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 40
+; CHECK64-NEXT:    pushq %rax
+; CHECK64-NEXT:    .cfi_def_cfa_offset 48
+; CHECK64-NEXT:    .cfi_offset %rbx, -40
+; CHECK64-NEXT:    .cfi_offset %r14, -32
+; CHECK64-NEXT:    .cfi_offset %r15, -24
+; CHECK64-NEXT:    .cfi_offset %rbp, -16
+; CHECK64-NEXT:    movl %edi, %r14d
+; CHECK64-NEXT:    movl $-1, %r15d
+; CHECK64-NEXT:    movl $-1, %edi
+; CHECK64-NEXT:    callq f at PLT
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    xorl %eax, %eax
+; CHECK64-NEXT:    cmpl $123, %r14d
+; CHECK64-NEXT:    setne %al
+; CHECK64-NEXT:    cmovel %r15d, %eax
+; CHECK64-NEXT:    addq $8, %rsp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 40
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    popq %r14
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    popq %r15
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: rematerialize_minus_one_eflags:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    pushq %r15
+; CHECKWIN64-NEXT:    .seh_pushreg %r15
+; CHECKWIN64-NEXT:    pushq %r14
+; CHECKWIN64-NEXT:    .seh_pushreg %r14
+; CHECKWIN64-NEXT:    pushq %rsi
+; CHECKWIN64-NEXT:    .seh_pushreg %rsi
+; CHECKWIN64-NEXT:    pushq %rdi
+; CHECKWIN64-NEXT:    .seh_pushreg %rdi
+; CHECKWIN64-NEXT:    pushq %rbp
+; CHECKWIN64-NEXT:    .seh_pushreg %rbp
+; CHECKWIN64-NEXT:    pushq %rbx
+; CHECKWIN64-NEXT:    .seh_pushreg %rbx
+; CHECKWIN64-NEXT:    subq $40, %rsp
+; CHECKWIN64-NEXT:    .seh_stackalloc 40
+; CHECKWIN64-NEXT:    .seh_endprologue
+; CHECKWIN64-NEXT:    movl %ecx, %r14d
+; CHECKWIN64-NEXT:    movl $-1, %r15d
+; CHECKWIN64-NEXT:    movl $-1, %ecx
+; CHECKWIN64-NEXT:    callq f
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    xorl %eax, %eax
+; CHECKWIN64-NEXT:    cmpl $123, %r14d
+; CHECKWIN64-NEXT:    setne %al
+; CHECKWIN64-NEXT:    cmovel %r15d, %eax
+; CHECKWIN64-NEXT:    .seh_startepilogue
+; CHECKWIN64-NEXT:    addq $40, %rsp
+; CHECKWIN64-NEXT:    popq %rbx
+; CHECKWIN64-NEXT:    popq %rbp
+; CHECKWIN64-NEXT:    popq %rdi
+; CHECKWIN64-NEXT:    popq %rsi
+; CHECKWIN64-NEXT:    popq %r14
+; CHECKWIN64-NEXT:    popq %r15
+; CHECKWIN64-NEXT:    .seh_endepilogue
+; CHECKWIN64-NEXT:    retq
+; CHECKWIN64-NEXT:    .seh_endproc
 entry:
   ; Materialize -1 (thiscall forces it into %ecx).
   tail call x86_thiscallcc void @f(i32 -1)
@@ -267,19 +648,94 @@ entry:
   %c = select i1 %a, i32 %b, i32 -1
   ret i32 %c
 
-; CHECK32-LABEL: rematerialize_minus_one_eflags
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       cmpl
-; CHECK32:       setne
-; CHECK32-NOT:   xorl
-; CHECK32:       movl $-1
-; CHECK32:       cmov
-; CHECK32:       retl
 }
 
 define i32 @rematerialize_minus_one_pgso() !prof !14 {
+; CHECK32-LABEL: rematerialize_minus_one_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    subl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 32
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    decl %ecx
+; CHECK32-NEXT:    calll f at PLT
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    addl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: rematerialize_minus_one_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    pushq %rax
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    .cfi_offset %rbx, -24
+; CHECK64-NEXT:    .cfi_offset %rbp, -16
+; CHECK64-NEXT:    movl $-1, %edi
+; CHECK64-NEXT:    callq f at PLT
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    movl $-1, %eax
+; CHECK64-NEXT:    addq $8, %rsp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: rematerialize_minus_one_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    pushq %rsi
+; CHECKWIN64-NEXT:    .seh_pushreg %rsi
+; CHECKWIN64-NEXT:    pushq %rdi
+; CHECKWIN64-NEXT:    .seh_pushreg %rdi
+; CHECKWIN64-NEXT:    pushq %rbp
+; CHECKWIN64-NEXT:    .seh_pushreg %rbp
+; CHECKWIN64-NEXT:    pushq %rbx
+; CHECKWIN64-NEXT:    .seh_pushreg %rbx
+; CHECKWIN64-NEXT:    subq $40, %rsp
+; CHECKWIN64-NEXT:    .seh_stackalloc 40
+; CHECKWIN64-NEXT:    .seh_endprologue
+; CHECKWIN64-NEXT:    movl $-1, %ecx
+; CHECKWIN64-NEXT:    callq f
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    movl $-1, %eax
+; CHECKWIN64-NEXT:    .seh_startepilogue
+; CHECKWIN64-NEXT:    addq $40, %rsp
+; CHECKWIN64-NEXT:    popq %rbx
+; CHECKWIN64-NEXT:    popq %rbp
+; CHECKWIN64-NEXT:    popq %rdi
+; CHECKWIN64-NEXT:    popq %rsi
+; CHECKWIN64-NEXT:    .seh_endepilogue
+; CHECKWIN64-NEXT:    retq
+; CHECKWIN64-NEXT:    .seh_endproc
 entry:
   ; Materialize -1 (thiscall forces it into %ecx).
   tail call x86_thiscallcc void @f(i32 -1)
@@ -291,17 +747,125 @@ entry:
   ; -1 should be re-materialized here instead of getting spilled above.
   ret i32 -1
 
-; CHECK32-LABEL: rematerialize_minus_one_pgso
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       xorl %eax, %eax
-; CHECK32-NEXT:  decl %eax
-; CHECK32-NOT:   %eax
-; CHECK32:       retl
 }
 
 define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 {
+; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    subl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 32
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    xorl %esi, %esi
+; CHECK32-NEXT:    decl %esi
+; CHECK32-NEXT:    movl %esi, %ecx
+; CHECK32-NEXT:    calll f at PLT
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    cmpl $123, {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    setne %al
+; CHECK32-NEXT:    cmovel %esi, %eax
+; CHECK32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK32-NEXT:    addl $12, %esp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: rematerialize_minus_one_eflags_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    pushq %r15
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    pushq %r14
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 40
+; CHECK64-NEXT:    pushq %rax
+; CHECK64-NEXT:    .cfi_def_cfa_offset 48
+; CHECK64-NEXT:    .cfi_offset %rbx, -40
+; CHECK64-NEXT:    .cfi_offset %r14, -32
+; CHECK64-NEXT:    .cfi_offset %r15, -24
+; CHECK64-NEXT:    .cfi_offset %rbp, -16
+; CHECK64-NEXT:    movl %edi, %r14d
+; CHECK64-NEXT:    movl $-1, %r15d
+; CHECK64-NEXT:    movl $-1, %edi
+; CHECK64-NEXT:    callq f at PLT
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    xorl %eax, %eax
+; CHECK64-NEXT:    cmpl $123, %r14d
+; CHECK64-NEXT:    setne %al
+; CHECK64-NEXT:    cmovel %r15d, %eax
+; CHECK64-NEXT:    addq $8, %rsp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 40
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    .cfi_def_cfa_offset 32
+; CHECK64-NEXT:    popq %r14
+; CHECK64-NEXT:    .cfi_def_cfa_offset 24
+; CHECK64-NEXT:    popq %r15
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbp
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    retq
+;
+; CHECKWIN64-LABEL: rematerialize_minus_one_eflags_pgso:
+; CHECKWIN64:       # %bb.0: # %entry
+; CHECKWIN64-NEXT:    pushq %r15
+; CHECKWIN64-NEXT:    .seh_pushreg %r15
+; CHECKWIN64-NEXT:    pushq %r14
+; CHECKWIN64-NEXT:    .seh_pushreg %r14
+; CHECKWIN64-NEXT:    pushq %rsi
+; CHECKWIN64-NEXT:    .seh_pushreg %rsi
+; CHECKWIN64-NEXT:    pushq %rdi
+; CHECKWIN64-NEXT:    .seh_pushreg %rdi
+; CHECKWIN64-NEXT:    pushq %rbp
+; CHECKWIN64-NEXT:    .seh_pushreg %rbp
+; CHECKWIN64-NEXT:    pushq %rbx
+; CHECKWIN64-NEXT:    .seh_pushreg %rbx
+; CHECKWIN64-NEXT:    subq $40, %rsp
+; CHECKWIN64-NEXT:    .seh_stackalloc 40
+; CHECKWIN64-NEXT:    .seh_endprologue
+; CHECKWIN64-NEXT:    movl %ecx, %r14d
+; CHECKWIN64-NEXT:    movl $-1, %r15d
+; CHECKWIN64-NEXT:    movl $-1, %ecx
+; CHECKWIN64-NEXT:    callq f
+; CHECKWIN64-NEXT:    #APP
+; CHECKWIN64-NEXT:    #NO_APP
+; CHECKWIN64-NEXT:    xorl %eax, %eax
+; CHECKWIN64-NEXT:    cmpl $123, %r14d
+; CHECKWIN64-NEXT:    setne %al
+; CHECKWIN64-NEXT:    cmovel %r15d, %eax
+; CHECKWIN64-NEXT:    .seh_startepilogue
+; CHECKWIN64-NEXT:    addq $40, %rsp
+; CHECKWIN64-NEXT:    popq %rbx
+; CHECKWIN64-NEXT:    popq %rbp
+; CHECKWIN64-NEXT:    popq %rdi
+; CHECKWIN64-NEXT:    popq %rsi
+; CHECKWIN64-NEXT:    popq %r14
+; CHECKWIN64-NEXT:    popq %r15
+; CHECKWIN64-NEXT:    .seh_endepilogue
+; CHECKWIN64-NEXT:    retq
+; CHECKWIN64-NEXT:    .seh_endproc
 entry:
   ; Materialize -1 (thiscall forces it into %ecx).
   tail call x86_thiscallcc void @f(i32 -1)
@@ -318,16 +882,6 @@ entry:
   %c = select i1 %a, i32 %b, i32 -1
   ret i32 %c
 
-; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso
-; CHECK32:       xorl %ecx, %ecx
-; CHECK32-NEXT:  decl %ecx
-; CHECK32:       calll
-; CHECK32:       cmpl
-; CHECK32:       setne
-; CHECK32-NOT:   xorl
-; CHECK32:       movl $-1
-; CHECK32:       cmov
-; CHECK32:       retl
 }
 
 declare x86_thiscallcc void @f(i32)
@@ -348,3 +902,6 @@ declare x86_thiscallcc void @f(i32)
 !12 = !{i32 999000, i64 100, i32 1}
 !13 = !{i32 999999, i64 1, i32 2}
 !14 = !{!"function_entry_count", i64 0}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; OPERAND32: {{.*}}
+; OPERAND64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/mcu-abi.ll b/llvm/test/CodeGen/X86/mcu-abi.ll
index 53c228943d914..df9fcc1c7c333 100644
--- a/llvm/test/CodeGen/X86/mcu-abi.ll
+++ b/llvm/test/CodeGen/X86/mcu-abi.ll
@@ -102,12 +102,18 @@ define i32 @test_lib_args(float %a, float %b) #0 {
 define i32 @test_fp128(ptr %ptr) #0 {
 ; CHECK-LABEL: test_fp128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl 12(%eax)
-; CHECK-NEXT:    pushl 8(%eax)
-; CHECK-NEXT:    pushl 4(%eax)
-; CHECK-NEXT:    pushl (%eax)
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl 12(%eax), %ecx
+; CHECK-NEXT:    movl 8(%eax), %edx
+; CHECK-NEXT:    movl (%eax), %esi
+; CHECK-NEXT:    movl 4(%eax), %eax
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    calll __fixtfsi
 ; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
   %v = load fp128, ptr %ptr
   %ret = fptosi fp128 %v to i32
diff --git a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll b/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
index c16e2adb7a078..506e35ce8e664 100644
--- a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
+++ b/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
@@ -10,12 +10,12 @@ define zeroext i1 @opeq1(
 ; X86-LABEL: opeq1:
 ; X86:       # %bb.0: # %"entry+land.rhs.i"
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
index 200a8184d4bd1..4cc2230eed2ee 100644
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
@@ -12,9 +12,11 @@ declare dso_local i32 @memcmp(ptr, ptr, i32)
 define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -26,9 +28,9 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length2_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -51,9 +53,11 @@ define i1 @length2_eq_const(ptr %X) nounwind minsize {
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length2_eq_nobuiltin_attr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -67,9 +71,11 @@ define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
 define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length3:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -80,9 +86,11 @@ define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length3_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -96,9 +104,11 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
 define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length4:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $4
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -110,9 +120,9 @@ define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length4_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
@@ -135,9 +145,11 @@ define i1 @length4_eq_const(ptr %X) nounwind minsize {
 define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length5:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -148,9 +160,11 @@ define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length5_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -164,9 +178,11 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
 define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -177,9 +193,11 @@ define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length8_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -193,9 +211,10 @@ define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length8_eq_const(ptr %X) nounwind minsize {
 ; X86-LABEL: length8_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -209,9 +228,11 @@ define i1 @length8_eq_const(ptr %X) nounwind minsize {
 define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length12_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -225,9 +246,11 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
 define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length12:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -240,9 +263,11 @@ define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
 define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -253,9 +278,11 @@ define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -265,8 +292,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
@@ -281,9 +308,10 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
 define i1 @length16_eq_const(ptr %X) nounwind minsize {
 ; X86-NOSSE-LABEL: length16_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $16
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -309,9 +337,11 @@ define i1 @length16_eq_const(ptr %X) nounwind minsize {
 define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length24:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -322,9 +352,11 @@ define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
 ; X86-LABEL: length24_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -338,9 +370,10 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
 define i1 @length24_eq_const(ptr %X) nounwind minsize {
 ; X86-LABEL: length24_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $24
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -354,9 +387,11 @@ define i1 @length24_eq_const(ptr %X) nounwind minsize {
 define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -369,9 +404,11 @@ define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
 ; X86-LABEL: length32_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -385,9 +422,10 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
 define i1 @length32_eq_const(ptr %X) nounwind minsize {
 ; X86-LABEL: length32_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $32
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -401,9 +439,11 @@ define i1 @length32_eq_const(ptr %X) nounwind minsize {
 define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
 ; X86-LABEL: length64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -414,9 +454,11 @@ define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
 define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
 ; X86-LABEL: length64_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -430,9 +472,10 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
 define i1 @length64_eq_const(ptr %X) nounwind minsize {
 ; X86-LABEL: length64_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $64
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index 7d1422d3c961e..fa758bdeccdf2 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -45,13 +45,13 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -62,9 +62,9 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -76,13 +76,13 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -96,13 +96,13 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_gt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
 ; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
@@ -127,9 +127,11 @@ define i1 @length2_eq_const(ptr %X) nounwind {
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_eq_nobuiltin_attr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -146,15 +148,15 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl (%eax), %esi
 ; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    cmpw %dx, %si
 ; X86-NEXT:    jne .LBB9_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -172,13 +174,13 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length3_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorw (%edx), %cx
+; X86-NEXT:    movzbl 2(%eax), %eax
+; X86-NEXT:    xorb 2(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orw %cx, %ax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
@@ -190,11 +192,11 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
@@ -208,9 +210,9 @@ define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
@@ -222,11 +224,11 @@ define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
@@ -239,11 +241,11 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_gt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    retl
@@ -270,15 +272,15 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB16_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -296,13 +298,13 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length5_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
+; X86-NEXT:    xorb 4(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
@@ -316,15 +318,15 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB18_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB18_2
 ; X86-NEXT:  .LBB18_3: # %res_block
@@ -345,25 +347,25 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB19_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB19_3
 ; X86-NEXT:  .LBB19_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB19_3: # %endblock
@@ -377,12 +379,12 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 3(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 3(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
@@ -394,25 +396,25 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB21_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB21_3
 ; X86-NEXT:  .LBB21_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB21_3: # %endblock
@@ -429,25 +431,25 @@ define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB22_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB22_3
 ; X86-NEXT:  .LBB22_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB22_3: # %endblock
@@ -461,12 +463,12 @@ define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length8_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
@@ -478,10 +480,10 @@ define i1 @length8_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length8_eq_const:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    movl $858927408, %edx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -495,16 +497,16 @@ define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzbl 8(%ecx), %ecx
-; X86-NEXT:    xorb 8(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movzbl 8(%eax), %eax
+; X86-NEXT:    xorb 8(%esi), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -518,16 +520,16 @@ define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzwl 8(%ecx), %ecx
-; X86-NEXT:    xorw 8(%eax), %cx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movzwl 8(%eax), %eax
+; X86-NEXT:    xorw 8(%esi), %ax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -541,15 +543,15 @@ define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 7(%ecx), %ecx
-; X86-NEXT:    xorl 7(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 7(%eax), %eax
+; X86-NEXT:    xorl 7(%esi), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -563,15 +565,15 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %ecx
-; X86-NEXT:    xorl 8(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl 8(%eax), %eax
+; X86-NEXT:    xorl 8(%esi), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -584,32 +586,32 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length12:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB29_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB29_3
 ; X86-NEXT:  # %bb.2: # %loadbb2
 ; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 8(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB29_4
 ; X86-NEXT:  .LBB29_3: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB29_4: # %endblock
@@ -624,18 +626,18 @@ define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzbl 12(%edx), %edx
-; X86-NEXT:    xorb 12(%ecx), %dl
-; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    xorl 8(%esi), %edx
+; X86-NEXT:    movzbl 12(%ecx), %ecx
+; X86-NEXT:    xorb 12(%esi), %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
@@ -650,18 +652,18 @@ define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzwl 12(%edx), %edx
-; X86-NEXT:    xorw 12(%ecx), %dx
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl 4(%esi), %edx
+; X86-NEXT:    xorl (%esi), %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    xorl 8(%esi), %edx
+; X86-NEXT:    movzwl 12(%ecx), %ecx
+; X86-NEXT:    xorw 12(%esi), %cx
+; X86-NEXT:    movzwl %cx, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
@@ -675,19 +677,19 @@ define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 11(%eax), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movl 11(%edx), %edx
-; X86-NEXT:    xorl 11(%ecx), %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    xorl 11(%edx), %ecx
+; X86-NEXT:    movl 8(%eax), %esi
+; X86-NEXT:    xorl 8(%edx), %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -702,39 +704,39 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB33_4
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl 4(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB33_4
 ; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
+; X86-NEXT:    movl 8(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl 8(%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB33_4
 ; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
+; X86-NEXT:    movl 12(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl 12(%eax), %esi
+; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    je .LBB33_5
 ; X86-NEXT:  .LBB33_4: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB33_5: # %endblock
@@ -748,19 +750,19 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl 12(%eax), %ecx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl (%edx), %esi
-; X86-NOSSE-NEXT:    movl 4(%edx), %eax
-; X86-NOSSE-NEXT:    xorl (%ecx), %esi
-; X86-NOSSE-NEXT:    xorl 4(%ecx), %eax
-; X86-NOSSE-NEXT:    orl %esi, %eax
-; X86-NOSSE-NEXT:    movl 8(%edx), %esi
-; X86-NOSSE-NEXT:    xorl 8(%ecx), %esi
-; X86-NOSSE-NEXT:    movl 12(%edx), %edx
-; X86-NOSSE-NEXT:    xorl 12(%ecx), %edx
-; X86-NOSSE-NEXT:    orl %esi, %edx
-; X86-NOSSE-NEXT:    orl %eax, %edx
+; X86-NOSSE-NEXT:    xorl 12(%edx), %ecx
+; X86-NOSSE-NEXT:    movl 8(%eax), %esi
+; X86-NOSSE-NEXT:    xorl 8(%edx), %esi
+; X86-NOSSE-NEXT:    orl %ecx, %esi
+; X86-NOSSE-NEXT:    movl (%eax), %ecx
+; X86-NOSSE-NEXT:    movl 4(%eax), %eax
+; X86-NOSSE-NEXT:    xorl 4(%edx), %eax
+; X86-NOSSE-NEXT:    xorl (%edx), %ecx
+; X86-NOSSE-NEXT:    orl %eax, %ecx
+; X86-NOSSE-NEXT:    orl %esi, %ecx
 ; X86-NOSSE-NEXT:    setne %al
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    retl
@@ -768,19 +770,19 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE1-LABEL: length16_eq:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl 12(%eax), %ecx
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    movl (%edx), %esi
-; X86-SSE1-NEXT:    movl 4(%edx), %eax
-; X86-SSE1-NEXT:    xorl (%ecx), %esi
-; X86-SSE1-NEXT:    xorl 4(%ecx), %eax
-; X86-SSE1-NEXT:    orl %esi, %eax
-; X86-SSE1-NEXT:    movl 8(%edx), %esi
-; X86-SSE1-NEXT:    xorl 8(%ecx), %esi
-; X86-SSE1-NEXT:    movl 12(%edx), %edx
-; X86-SSE1-NEXT:    xorl 12(%ecx), %edx
-; X86-SSE1-NEXT:    orl %esi, %edx
-; X86-SSE1-NEXT:    orl %eax, %edx
+; X86-SSE1-NEXT:    xorl 12(%edx), %ecx
+; X86-SSE1-NEXT:    movl 8(%eax), %esi
+; X86-SSE1-NEXT:    xorl 8(%edx), %esi
+; X86-SSE1-NEXT:    orl %ecx, %esi
+; X86-SSE1-NEXT:    movl (%eax), %ecx
+; X86-SSE1-NEXT:    movl 4(%eax), %eax
+; X86-SSE1-NEXT:    xorl 4(%edx), %eax
+; X86-SSE1-NEXT:    xorl (%edx), %ecx
+; X86-SSE1-NEXT:    orl %eax, %ecx
+; X86-SSE1-NEXT:    orl %esi, %ecx
 ; X86-SSE1-NEXT:    setne %al
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    retl
@@ -788,8 +790,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
@@ -800,8 +802,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length16_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
 ; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
@@ -816,39 +818,39 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length16_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB35_4
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl 4(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB35_4
 ; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
+; X86-NEXT:    movl 8(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    movl 8(%eax), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    jne .LBB35_4
 ; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
+; X86-NEXT:    movl 12(%edx), %ecx
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl 12(%eax), %esi
+; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    je .LBB35_5
 ; X86-NEXT:  .LBB35_4: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB35_5: # %endblock
@@ -866,42 +868,42 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    movl (%edx), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    jne .LBB36_4
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %eax
-; X86-NEXT:    movl 4(%edx), %ecx
+; X86-NEXT:    movl 4(%edx), %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl 4(%ecx), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    jne .LBB36_4
 ; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %eax
-; X86-NEXT:    movl 8(%edx), %ecx
+; X86-NEXT:    movl 8(%edx), %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl 8(%ecx), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    jne .LBB36_4
 ; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %eax
-; X86-NEXT:    movl 12(%edx), %ecx
+; X86-NEXT:    movl 12(%edx), %eax
 ; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl 12(%ecx), %esi
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    je .LBB36_5
 ; X86-NEXT:  .LBB36_4: # %res_block
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    setae %dl
-; X86-NEXT:    leal -1(%edx,%edx), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl %eax, %esi
+; X86-NEXT:    setae %cl
+; X86-NEXT:    leal -1(%ecx,%ecx), %ecx
 ; X86-NEXT:  .LBB36_5: # %endblock
-; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -915,15 +917,15 @@ define i1 @length16_eq_const(ptr %X) nounwind {
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NOSSE-NEXT:    xorl (%eax), %ecx
-; X86-NOSSE-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NOSSE-NEXT:    xorl 4(%eax), %edx
+; X86-NOSSE-NEXT:    movl $892613426, %ecx # imm = 0x35343332
+; X86-NOSSE-NEXT:    xorl 12(%eax), %ecx
+; X86-NOSSE-NEXT:    movl $825243960, %edx # imm = 0x31303938
+; X86-NOSSE-NEXT:    xorl 8(%eax), %edx
 ; X86-NOSSE-NEXT:    orl %ecx, %edx
-; X86-NOSSE-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-NOSSE-NEXT:    xorl 8(%eax), %ecx
-; X86-NOSSE-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-NOSSE-NEXT:    xorl 12(%eax), %esi
+; X86-NOSSE-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-NOSSE-NEXT:    xorl 4(%eax), %ecx
+; X86-NOSSE-NEXT:    movl $858927408, %esi # imm = 0x33323130
+; X86-NOSSE-NEXT:    xorl (%eax), %esi
 ; X86-NOSSE-NEXT:    orl %ecx, %esi
 ; X86-NOSSE-NEXT:    orl %edx, %esi
 ; X86-NOSSE-NEXT:    sete %al
@@ -934,15 +936,15 @@ define i1 @length16_eq_const(ptr %X) nounwind {
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    pushl %esi
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-SSE1-NEXT:    xorl (%eax), %ecx
-; X86-SSE1-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-SSE1-NEXT:    xorl 4(%eax), %edx
+; X86-SSE1-NEXT:    movl $892613426, %ecx # imm = 0x35343332
+; X86-SSE1-NEXT:    xorl 12(%eax), %ecx
+; X86-SSE1-NEXT:    movl $825243960, %edx # imm = 0x31303938
+; X86-SSE1-NEXT:    xorl 8(%eax), %edx
 ; X86-SSE1-NEXT:    orl %ecx, %edx
-; X86-SSE1-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-SSE1-NEXT:    xorl 8(%eax), %ecx
-; X86-SSE1-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-SSE1-NEXT:    xorl 12(%eax), %esi
+; X86-SSE1-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-SSE1-NEXT:    xorl 4(%eax), %ecx
+; X86-SSE1-NEXT:    movl $858927408, %esi # imm = 0x33323130
+; X86-SSE1-NEXT:    xorl (%eax), %esi
 ; X86-SSE1-NEXT:    orl %ecx, %esi
 ; X86-SSE1-NEXT:    orl %edx, %esi
 ; X86-SSE1-NEXT:    sete %al
@@ -977,9 +979,11 @@ define i1 @length16_eq_const(ptr %X) nounwind {
 define i32 @length24(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length24:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -990,9 +994,11 @@ define i32 @length24(ptr %X, ptr %Y) nounwind {
 define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length24_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1001,9 +1007,11 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length24_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1013,9 +1021,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length24_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
@@ -1029,9 +1037,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length24_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
@@ -1048,9 +1056,11 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length24_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1064,9 +1074,11 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length24_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1080,9 +1092,10 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 define i1 @length24_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length24_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $24
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1091,9 +1104,10 @@ define i1 @length24_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length24_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $24
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1132,9 +1146,11 @@ define i1 @length24_eq_const(ptr %X) nounwind {
 define i32 @length31(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length31:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1145,9 +1161,11 @@ define i32 @length31(ptr %X, ptr %Y) nounwind {
 define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length31_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1156,9 +1174,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length31_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1168,9 +1188,9 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length31_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
@@ -1184,9 +1204,9 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length31_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
@@ -1203,9 +1223,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length31_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1219,9 +1241,11 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length31_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1235,9 +1259,11 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-NOSSE-LABEL: length31_eq_prefer128:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1246,9 +1272,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ;
 ; X86-SSE1-LABEL: length31_eq_prefer128:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1258,9 +1286,9 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-LABEL: length31_eq_prefer128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
@@ -1274,9 +1302,9 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-LABEL: length31_eq_prefer128:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
@@ -1293,9 +1321,10 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length31_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length31_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $31
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1304,9 +1333,10 @@ define i1 @length31_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length31_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $31
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1345,9 +1375,11 @@ define i1 @length31_eq_const(ptr %X) nounwind {
 define i32 @length32(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1360,9 +1392,11 @@ define i32 @length32(ptr %X, ptr %Y) nounwind {
 define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length32_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1371,9 +1405,11 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length32_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1383,9 +1419,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length32_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -1399,9 +1435,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length32_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
@@ -1418,9 +1454,11 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length32_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1434,9 +1472,11 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length32_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1450,9 +1490,11 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-NOSSE-LABEL: length32_eq_prefer128:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1461,9 +1503,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ;
 ; X86-SSE1-LABEL: length32_eq_prefer128:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1473,9 +1517,9 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-LABEL: length32_eq_prefer128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -1489,9 +1533,9 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-LABEL: length32_eq_prefer128:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
@@ -1508,9 +1552,10 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length32_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length32_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $32
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1519,9 +1564,10 @@ define i1 @length32_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length32_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $32
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1560,9 +1606,11 @@ define i1 @length32_eq_const(ptr %X) nounwind {
 define i32 @length48(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length48:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1573,9 +1621,11 @@ define i32 @length48(ptr %X, ptr %Y) nounwind {
 define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length48_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1584,9 +1634,11 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length48_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1596,16 +1648,16 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length48_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
@@ -1616,16 +1668,16 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length48_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE41-NEXT:    por %xmm0, %xmm2
 ; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
@@ -1639,9 +1691,11 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length48_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1655,9 +1709,11 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length48_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1671,9 +1727,11 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-NOSSE-LABEL: length48_eq_prefer128:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1682,9 +1740,11 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ;
 ; X86-SSE1-LABEL: length48_eq_prefer128:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1694,16 +1754,16 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-LABEL: length48_eq_prefer128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
@@ -1714,16 +1774,16 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-LABEL: length48_eq_prefer128:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
 ; X86-SSE41-NEXT:    por %xmm0, %xmm2
 ; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
@@ -1737,9 +1797,10 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length48_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $48
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1748,9 +1809,10 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length48_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $48
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1762,13 +1824,13 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    retl
@@ -1778,13 +1840,13 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT:    por %xmm0, %xmm1
+; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
@@ -1795,9 +1857,11 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 define i32 @length63(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length63:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1808,9 +1872,11 @@ define i32 @length63(ptr %X, ptr %Y) nounwind {
 define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length63_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $63
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1819,9 +1885,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length63_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $63
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1830,48 +1898,48 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE2-LABEL: length63_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm3
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: length63_eq:
 ; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm0
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    por %xmm1, %xmm2
 ; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
 ; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm3
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm3
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm3, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
+; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
@@ -1882,9 +1950,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length63_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1898,9 +1968,11 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length63_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1914,9 +1986,10 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length63_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $63
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1925,9 +1998,10 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length63_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $63
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1937,17 +2011,17 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-SSE2-LABEL: length63_eq_const:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
@@ -1956,17 +2030,17 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-SSE41-LABEL: length63_eq_const:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm0
+; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT:    por %xmm0, %xmm1
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
+; X86-SSE41-NEXT:    por %xmm1, %xmm0
 ; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl
@@ -1978,9 +2052,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 define i32 @length64(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1991,9 +2067,11 @@ define i32 @length64(ptr %X, ptr %Y) nounwind {
 define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length64_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $64
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -2002,9 +2080,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length64_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $64
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -2013,48 +2093,48 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE2-LABEL: length64_eq:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm3
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: length64_eq:
 ; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm0
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    por %xmm1, %xmm2
 ; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
 ; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm3
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm3
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm3, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
+; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
@@ -2065,9 +2145,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length64_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2081,9 +2163,11 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length64_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2097,9 +2181,10 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length64_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $64
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -2108,9 +2193,10 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length64_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $64
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -2120,17 +2206,17 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-SSE2-LABEL: length64_eq_const:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
@@ -2139,17 +2225,17 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-SSE41-LABEL: length64_eq_const:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm0
+; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT:    por %xmm0, %xmm1
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
 ; X86-SSE41-NEXT:    por %xmm2, %xmm0
+; X86-SSE41-NEXT:    por %xmm1, %xmm0
 ; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl
@@ -2161,9 +2247,11 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 define i32 @length96(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length96:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2174,9 +2262,11 @@ define i32 @length96(ptr %X, ptr %Y) nounwind {
 define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2190,9 +2280,11 @@ define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 define i1 @length96_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2206,9 +2298,11 @@ define i1 @length96_lt(ptr %x, ptr %y) nounwind {
 define i1 @length96_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2222,9 +2316,10 @@ define i1 @length96_gt(ptr %x, ptr %y) nounwind {
 define i1 @length96_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length96_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $96
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2238,9 +2333,11 @@ define i1 @length96_eq_const(ptr %X) nounwind {
 define i32 @length127(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length127:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2251,9 +2348,11 @@ define i32 @length127(ptr %X, ptr %Y) nounwind {
 define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2267,9 +2366,11 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 define i1 @length127_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2283,9 +2384,11 @@ define i1 @length127_lt(ptr %x, ptr %y) nounwind {
 define i1 @length127_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2299,9 +2402,10 @@ define i1 @length127_gt(ptr %x, ptr %y) nounwind {
 define i1 @length127_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length127_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $127
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2315,9 +2419,11 @@ define i1 @length127_eq_const(ptr %X) nounwind {
 define i32 @length128(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2328,9 +2434,11 @@ define i32 @length128(ptr %X, ptr %Y) nounwind {
 define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2344,9 +2452,11 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 define i1 @length128_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2360,9 +2470,11 @@ define i1 @length128_lt(ptr %x, ptr %y) nounwind {
 define i1 @length128_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2376,9 +2488,10 @@ define i1 @length128_gt(ptr %x, ptr %y) nounwind {
 define i1 @length128_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length128_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $128
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2392,9 +2505,11 @@ define i1 @length128_eq_const(ptr %X) nounwind {
 define i32 @length192(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length192:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2405,9 +2520,11 @@ define i32 @length192(ptr %X, ptr %Y) nounwind {
 define i1 @length192_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2421,9 +2538,11 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind {
 define i1 @length192_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2437,9 +2556,11 @@ define i1 @length192_lt(ptr %x, ptr %y) nounwind {
 define i1 @length192_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2453,9 +2574,10 @@ define i1 @length192_gt(ptr %x, ptr %y) nounwind {
 define i1 @length192_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length192_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $192
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2469,9 +2591,11 @@ define i1 @length192_eq_const(ptr %X) nounwind {
 define i32 @length255(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length255:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2482,9 +2606,11 @@ define i32 @length255(ptr %X, ptr %Y) nounwind {
 define i1 @length255_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2498,9 +2624,11 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind {
 define i1 @length255_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2514,9 +2642,11 @@ define i1 @length255_lt(ptr %x, ptr %y) nounwind {
 define i1 @length255_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2530,9 +2660,10 @@ define i1 @length255_gt(ptr %x, ptr %y) nounwind {
 define i1 @length255_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length255_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $255
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2546,9 +2677,11 @@ define i1 @length255_eq_const(ptr %X) nounwind {
 define i32 @length256(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2559,9 +2692,11 @@ define i32 @length256(ptr %X, ptr %Y) nounwind {
 define i1 @length256_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2575,9 +2710,11 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind {
 define i1 @length256_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2591,9 +2728,11 @@ define i1 @length256_lt(ptr %x, ptr %y) nounwind {
 define i1 @length256_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2607,9 +2746,10 @@ define i1 @length256_gt(ptr %x, ptr %y) nounwind {
 define i1 @length256_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length256_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $256 # imm = 0x100
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2623,9 +2763,11 @@ define i1 @length256_eq_const(ptr %X) nounwind {
 define i32 @length384(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length384:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2636,9 +2778,11 @@ define i32 @length384(ptr %X, ptr %Y) nounwind {
 define i1 @length384_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2652,9 +2796,11 @@ define i1 @length384_eq(ptr %x, ptr %y) nounwind {
 define i1 @length384_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2668,9 +2814,11 @@ define i1 @length384_lt(ptr %x, ptr %y) nounwind {
 define i1 @length384_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2684,9 +2832,10 @@ define i1 @length384_gt(ptr %x, ptr %y) nounwind {
 define i1 @length384_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length384_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $384 # imm = 0x180
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2700,9 +2849,11 @@ define i1 @length384_eq_const(ptr %X) nounwind {
 define i32 @length511(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length511:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2713,9 +2864,11 @@ define i32 @length511(ptr %X, ptr %Y) nounwind {
 define i1 @length511_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2729,9 +2882,11 @@ define i1 @length511_eq(ptr %x, ptr %y) nounwind {
 define i1 @length511_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2745,9 +2900,11 @@ define i1 @length511_lt(ptr %x, ptr %y) nounwind {
 define i1 @length511_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2761,9 +2918,10 @@ define i1 @length511_gt(ptr %x, ptr %y) nounwind {
 define i1 @length511_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length511_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2777,9 +2935,11 @@ define i1 @length511_eq_const(ptr %X) nounwind {
 define i32 @length512(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length512:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2790,9 +2950,11 @@ define i32 @length512(ptr %X, ptr %Y) nounwind {
 define i1 @length512_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2806,9 +2968,11 @@ define i1 @length512_eq(ptr %x, ptr %y) nounwind {
 define i1 @length512_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2822,9 +2986,11 @@ define i1 @length512_lt(ptr %x, ptr %y) nounwind {
 define i1 @length512_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2838,9 +3004,10 @@ define i1 @length512_gt(ptr %x, ptr %y) nounwind {
 define i1 @length512_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length512_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $512 # imm = 0x200
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2855,9 +3022,11 @@ define i1 @length512_eq_const(ptr %X) nounwind {
 define i32 @huge_length(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: huge_length:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2868,9 +3037,11 @@ define i32 @huge_length(ptr %X, ptr %Y) nounwind {
 define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: huge_length_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2893,9 +3064,12 @@ define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
 define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
 ; X86-LABEL: nonconst_length_eq:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 09f02c3f56346..6d6262050106b 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -14,13 +14,13 @@ define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -31,9 +31,9 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length2_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -56,9 +56,11 @@ define i1 @length2_eq_const(ptr %X) nounwind optsize {
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length2_eq_nobuiltin_attr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -75,15 +77,15 @@ define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl (%eax), %esi
 ; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    cmpw %dx, %si
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB4_2
 ; X86-NEXT:  .LBB4_3: # %res_block
@@ -101,13 +103,13 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length3_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorw (%edx), %cx
+; X86-NEXT:    movb 2(%eax), %al
+; X86-NEXT:    xorb 2(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orw %cx, %ax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
@@ -119,11 +121,11 @@ define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
@@ -137,9 +139,9 @@ define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length4_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
@@ -165,15 +167,15 @@ define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB9_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB9_2
 ; X86-NEXT:  .LBB9_3: # %res_block
@@ -191,13 +193,13 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length5_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    movb 4(%eax), %al
+; X86-NEXT:    xorb 4(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
@@ -209,25 +211,25 @@ define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB11_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB11_3
 ; X86-NEXT:  .LBB11_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB11_3: # %endblock
@@ -241,12 +243,12 @@ define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length8_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
@@ -258,10 +260,10 @@ define i1 @length8_eq_const(ptr %X) nounwind optsize {
 ; X86-LABEL: length8_eq_const:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    movl $858927408, %edx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -273,9 +275,11 @@ define i1 @length8_eq_const(ptr %X) nounwind optsize {
 define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length12_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -289,9 +293,11 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length12:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -304,9 +310,11 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
 define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -317,9 +325,11 @@ define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
 define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -329,8 +339,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
@@ -345,9 +355,10 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
 define i1 @length16_eq_const(ptr %X) nounwind optsize {
 ; X86-NOSSE-LABEL: length16_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $16
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -373,9 +384,11 @@ define i1 @length16_eq_const(ptr %X) nounwind optsize {
 define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length24:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -386,9 +399,11 @@ define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
 define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-NOSSE-LABEL: length24_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -398,9 +413,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-SSE2-LABEL: length24_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
@@ -418,9 +433,10 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
 define i1 @length24_eq_const(ptr %X) nounwind optsize {
 ; X86-NOSSE-LABEL: length24_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $24
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -447,9 +463,11 @@ define i1 @length24_eq_const(ptr %X) nounwind optsize {
 define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -462,9 +480,11 @@ define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
 define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-NOSSE-LABEL: length32_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -474,9 +494,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-SSE2-LABEL: length32_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -494,9 +514,10 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
 define i1 @length32_eq_const(ptr %X) nounwind optsize {
 ; X86-NOSSE-LABEL: length32_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $32
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -523,9 +544,11 @@ define i1 @length32_eq_const(ptr %X) nounwind optsize {
 define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -536,9 +559,11 @@ define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
 define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
 ; X86-LABEL: length64_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -552,9 +577,10 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
 define i1 @length64_eq_const(ptr %X) nounwind optsize {
 ; X86-LABEL: length64_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $64
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -568,11 +594,11 @@ define i1 @length64_eq_const(ptr %X) nounwind optsize {
 define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: bcmp_length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
+; X86-NEXT:    cmpw (%edx), %cx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index 1b3fd6d4ddd3b..6fe2da9fa3bf0 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -14,13 +14,13 @@ define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -31,9 +31,9 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length2_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -56,9 +56,11 @@ define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length2_eq_nobuiltin_attr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -75,15 +77,15 @@ define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl (%eax), %esi
 ; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    cmpw %dx, %si
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB4_2
 ; X86-NEXT:  .LBB4_3: # %res_block
@@ -101,13 +103,13 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length3_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorw (%edx), %cx
+; X86-NEXT:    movb 2(%eax), %al
+; X86-NEXT:    xorb 2(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orw %cx, %ax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
@@ -119,11 +121,11 @@ define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
@@ -137,9 +139,9 @@ define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length4_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
@@ -165,15 +167,15 @@ define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB9_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB9_2
 ; X86-NEXT:  .LBB9_3: # %res_block
@@ -191,13 +193,13 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length5_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    movb 4(%eax), %al
+; X86-NEXT:    xorb 4(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
@@ -209,25 +211,25 @@ define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB11_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB11_3
 ; X86-NEXT:  .LBB11_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB11_3: # %endblock
@@ -241,12 +243,12 @@ define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length8_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
@@ -258,10 +260,10 @@ define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
 ; X86-LABEL: length8_eq_const:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    movl $858927408, %edx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -273,9 +275,11 @@ define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
 define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length12_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -289,9 +293,11 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length12:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -304,9 +310,11 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
 define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -317,9 +325,11 @@ define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
 define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -329,8 +339,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
@@ -345,9 +355,10 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
 define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length16_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $16
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -373,9 +384,11 @@ define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
 define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length24:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -386,9 +399,11 @@ define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
 define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length24_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -398,9 +413,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-SSE2-LABEL: length24_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
@@ -418,9 +433,10 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
 define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length24_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $24
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -447,9 +463,11 @@ define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
 define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -462,9 +480,11 @@ define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
 define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length32_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -474,9 +494,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-SSE2-LABEL: length32_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -494,9 +514,10 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
 define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
 ; X86-NOSSE-LABEL: length32_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $32
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -523,9 +544,11 @@ define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
 define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -536,9 +559,11 @@ define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
 define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
 ; X86-LABEL: length64_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -552,9 +577,10 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
 define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
 ; X86-LABEL: length64_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $64
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -568,11 +594,11 @@ define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
 define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: bcmp_length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
+; X86-NEXT:    cmpw (%edx), %cx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index 28e732be9191d..a5064bcdb0127 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -44,13 +44,13 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -90,9 +90,9 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -104,13 +104,13 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -124,13 +124,13 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_gt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
 ; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
@@ -155,9 +155,11 @@ define i1 @length2_eq_const(ptr %X) nounwind {
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_eq_nobuiltin_attr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -174,15 +176,15 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    movzwl (%ecx), %edx
 ; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl (%eax), %esi
 ; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    cmpw %dx, %si
 ; X86-NEXT:    jne .LBB11_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    movzbl 2(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -200,13 +202,13 @@ define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length3_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    movzwl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorw (%edx), %cx
+; X86-NEXT:    movzbl 2(%eax), %eax
+; X86-NEXT:    xorb 2(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orw %cx, %ax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
@@ -218,11 +220,11 @@ define i32 @length4(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
@@ -236,9 +238,9 @@ define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
@@ -250,11 +252,11 @@ define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
@@ -267,11 +269,11 @@ define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length4_gt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    retl
@@ -298,15 +300,15 @@ define i32 @length5(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB18_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
@@ -324,13 +326,13 @@ define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length5_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
+; X86-NEXT:    xorb 4(%edx), %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
@@ -344,15 +346,15 @@ define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl (%ecx), %edx
 ; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    jne .LBB20_3
 ; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    movzbl 4(%eax), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    jmp .LBB20_2
 ; X86-NEXT:  .LBB20_3: # %res_block
@@ -373,25 +375,25 @@ define i32 @length7(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB21_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB21_3
 ; X86-NEXT:  .LBB21_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB21_3: # %endblock
@@ -405,25 +407,25 @@ define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7_lt:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB22_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 3(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB22_3
 ; X86-NEXT:  .LBB22_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB22_3: # %endblock
@@ -440,12 +442,12 @@ define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length7_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 3(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 3(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
@@ -457,25 +459,25 @@ define i32 @length8(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl (%eax), %edx
 ; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    jne .LBB24_2
 ; X86-NEXT:  # %bb.1: # %loadbb1
 ; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    je .LBB24_3
 ; X86-NEXT:  .LBB24_2: # %res_block
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:  .LBB24_3: # %endblock
@@ -489,12 +491,12 @@ define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length8_eq:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
@@ -506,10 +508,10 @@ define i1 @length8_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length8_eq_const:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    movl $926299444, %ecx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    movl $858927408, %edx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    retl
@@ -521,9 +523,11 @@ define i1 @length8_eq_const(ptr %X) nounwind {
 define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length9_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $9
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -537,9 +541,11 @@ define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length10_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $10
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -553,9 +559,11 @@ define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length11_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $11
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -569,9 +577,11 @@ define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length12_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -585,9 +595,11 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length12:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -598,9 +610,11 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length13_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $13
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -614,9 +628,11 @@ define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length14_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $14
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -630,9 +646,11 @@ define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
 define i32 @length15(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -643,9 +661,11 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
 define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -659,9 +679,10 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -672,9 +693,11 @@ define i32 @length15_const(ptr %X, ptr %Y) nounwind {
 define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -688,9 +711,10 @@ define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length15_gt_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -706,9 +730,11 @@ define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
 define i32 @length16(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -719,9 +745,11 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
 define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length16_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -730,9 +758,11 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length16_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $16
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -742,8 +772,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
@@ -754,8 +784,8 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length16_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
 ; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
@@ -769,9 +799,11 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
 define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length16_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -785,9 +817,11 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
 define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length16_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -801,9 +835,10 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
 define i1 @length16_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length16_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $16
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -812,9 +847,10 @@ define i1 @length16_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length16_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $16
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -849,9 +885,11 @@ define i1 @length16_eq_const(ptr %X) nounwind {
 define i32 @length24(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length24:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -862,9 +900,11 @@ define i32 @length24(ptr %X, ptr %Y) nounwind {
 define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length24_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -873,9 +913,11 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length24_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -885,9 +927,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length24_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
@@ -901,9 +943,9 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length24_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
@@ -920,9 +962,11 @@ define i1 @length24_eq(ptr %x, ptr %y) nounwind {
 define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length24_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -936,9 +980,11 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind {
 define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length24_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -952,9 +998,10 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
 define i1 @length24_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length24_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $24
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -963,9 +1010,10 @@ define i1 @length24_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length24_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $24
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1004,9 +1052,11 @@ define i1 @length24_eq_const(ptr %X) nounwind {
 define i32 @length31(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length31:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1017,9 +1067,11 @@ define i32 @length31(ptr %X, ptr %Y) nounwind {
 define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length31_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1028,9 +1080,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length31_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1040,9 +1094,9 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length31_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
@@ -1056,9 +1110,9 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length31_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
@@ -1075,9 +1129,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
 define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length31_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1091,9 +1147,11 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind {
 define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length31_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1107,9 +1165,11 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind {
 define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-NOSSE-LABEL: length31_eq_prefer128:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1118,9 +1178,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ;
 ; X86-SSE1-LABEL: length31_eq_prefer128:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1130,9 +1192,9 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-LABEL: length31_eq_prefer128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
@@ -1146,9 +1208,9 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-LABEL: length31_eq_prefer128:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
@@ -1165,9 +1227,10 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length31_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length31_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $31
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1176,9 +1239,10 @@ define i1 @length31_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length31_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $31
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1217,9 +1281,11 @@ define i1 @length31_eq_const(ptr %X) nounwind {
 define i32 @length32(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length32:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1232,9 +1298,11 @@ define i32 @length32(ptr %X, ptr %Y) nounwind {
 define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-NOSSE-LABEL: length32_eq:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1243,9 +1311,11 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ;
 ; X86-SSE1-LABEL: length32_eq:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1255,9 +1325,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-LABEL: length32_eq:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -1271,9 +1341,9 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-LABEL: length32_eq:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
@@ -1290,9 +1360,11 @@ define i1 @length32_eq(ptr %x, ptr %y) nounwind {
 define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length32_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1306,9 +1378,11 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind {
 define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length32_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1322,9 +1396,11 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind {
 define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-NOSSE-LABEL: length32_eq_prefer128:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    pushl %ecx
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1333,9 +1409,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ;
 ; X86-SSE1-LABEL: length32_eq_prefer128:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    pushl %ecx
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1345,9 +1423,9 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-LABEL: length32_eq_prefer128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
@@ -1361,9 +1439,9 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-LABEL: length32_eq_prefer128:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
@@ -1380,9 +1458,10 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length32_eq_const(ptr %X) nounwind {
 ; X86-NOSSE-LABEL: length32_eq_const:
 ; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    pushl $32
 ; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    calll memcmp
 ; X86-NOSSE-NEXT:    addl $12, %esp
 ; X86-NOSSE-NEXT:    testl %eax, %eax
@@ -1391,9 +1470,10 @@ define i1 @length32_eq_const(ptr %X) nounwind {
 ;
 ; X86-SSE1-LABEL: length32_eq_const:
 ; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    pushl $32
 ; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT:    pushl %eax
 ; X86-SSE1-NEXT:    calll memcmp
 ; X86-SSE1-NEXT:    addl $12, %esp
 ; X86-SSE1-NEXT:    testl %eax, %eax
@@ -1432,9 +1512,11 @@ define i1 @length32_eq_const(ptr %X) nounwind {
 define i32 @length48(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length48:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1445,9 +1527,11 @@ define i32 @length48(ptr %X, ptr %Y) nounwind {
 define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length48_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1461,9 +1545,11 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length48_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1477,9 +1563,11 @@ define i1 @length48_lt(ptr %x, ptr %y) nounwind {
 define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length48_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1493,9 +1581,11 @@ define i1 @length48_gt(ptr %x, ptr %y) nounwind {
 define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
 ; X86-LABEL: length48_eq_prefer128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1509,9 +1599,10 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length48_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $48
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1525,9 +1616,11 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 define i32 @length63(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length63:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1538,9 +1631,11 @@ define i32 @length63(ptr %X, ptr %Y) nounwind {
 define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length63_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1554,9 +1649,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length63_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1570,9 +1667,11 @@ define i1 @length63_lt(ptr %x, ptr %y) nounwind {
 define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length63_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1586,9 +1685,10 @@ define i1 @length63_gt(ptr %x, ptr %y) nounwind {
 define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length63_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $63
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1602,9 +1702,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 define i32 @length64(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1615,9 +1717,11 @@ define i32 @length64(ptr %X, ptr %Y) nounwind {
 define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length64_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1631,9 +1735,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length64_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1647,9 +1753,11 @@ define i1 @length64_lt(ptr %x, ptr %y) nounwind {
 define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length64_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1663,9 +1771,10 @@ define i1 @length64_gt(ptr %x, ptr %y) nounwind {
 define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length64_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $64
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1679,9 +1788,11 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 define i32 @length96(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length96:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1692,9 +1803,11 @@ define i32 @length96(ptr %X, ptr %Y) nounwind {
 define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1708,9 +1821,11 @@ define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 define i1 @length96_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1724,9 +1839,11 @@ define i1 @length96_lt(ptr %x, ptr %y) nounwind {
 define i1 @length96_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length96_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1740,9 +1857,10 @@ define i1 @length96_gt(ptr %x, ptr %y) nounwind {
 define i1 @length96_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length96_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $96
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1756,9 +1874,11 @@ define i1 @length96_eq_const(ptr %X) nounwind {
 define i32 @length127(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length127:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1769,9 +1889,11 @@ define i32 @length127(ptr %X, ptr %Y) nounwind {
 define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1785,9 +1907,11 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 define i1 @length127_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1801,9 +1925,11 @@ define i1 @length127_lt(ptr %x, ptr %y) nounwind {
 define i1 @length127_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length127_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1817,9 +1943,10 @@ define i1 @length127_gt(ptr %x, ptr %y) nounwind {
 define i1 @length127_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length127_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $127
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1833,9 +1960,11 @@ define i1 @length127_eq_const(ptr %X) nounwind {
 define i32 @length128(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length128:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1846,9 +1975,11 @@ define i32 @length128(ptr %X, ptr %Y) nounwind {
 define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1862,9 +1993,11 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 define i1 @length128_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1878,9 +2011,11 @@ define i1 @length128_lt(ptr %x, ptr %y) nounwind {
 define i1 @length128_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length128_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1894,9 +2029,10 @@ define i1 @length128_gt(ptr %x, ptr %y) nounwind {
 define i1 @length128_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length128_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $128
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1910,9 +2046,11 @@ define i1 @length128_eq_const(ptr %X) nounwind {
 define i32 @length192(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length192:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -1923,9 +2061,11 @@ define i32 @length192(ptr %X, ptr %Y) nounwind {
 define i1 @length192_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1939,9 +2079,11 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind {
 define i1 @length192_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -1955,9 +2097,11 @@ define i1 @length192_lt(ptr %x, ptr %y) nounwind {
 define i1 @length192_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length192_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1971,9 +2115,10 @@ define i1 @length192_gt(ptr %x, ptr %y) nounwind {
 define i1 @length192_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length192_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $192
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -1987,9 +2132,11 @@ define i1 @length192_eq_const(ptr %X) nounwind {
 define i32 @length255(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length255:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2000,9 +2147,11 @@ define i32 @length255(ptr %X, ptr %Y) nounwind {
 define i1 @length255_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2016,9 +2165,11 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind {
 define i1 @length255_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2032,9 +2183,11 @@ define i1 @length255_lt(ptr %x, ptr %y) nounwind {
 define i1 @length255_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length255_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2048,9 +2201,10 @@ define i1 @length255_gt(ptr %x, ptr %y) nounwind {
 define i1 @length255_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length255_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $255
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2064,9 +2218,11 @@ define i1 @length255_eq_const(ptr %X) nounwind {
 define i32 @length256(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length256:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2077,9 +2233,11 @@ define i32 @length256(ptr %X, ptr %Y) nounwind {
 define i1 @length256_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2093,9 +2251,11 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind {
 define i1 @length256_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2109,9 +2269,11 @@ define i1 @length256_lt(ptr %x, ptr %y) nounwind {
 define i1 @length256_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length256_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2125,9 +2287,10 @@ define i1 @length256_gt(ptr %x, ptr %y) nounwind {
 define i1 @length256_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length256_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $256 # imm = 0x100
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2141,9 +2304,11 @@ define i1 @length256_eq_const(ptr %X) nounwind {
 define i32 @length384(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length384:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2154,9 +2319,11 @@ define i32 @length384(ptr %X, ptr %Y) nounwind {
 define i1 @length384_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2170,9 +2337,11 @@ define i1 @length384_eq(ptr %x, ptr %y) nounwind {
 define i1 @length384_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2186,9 +2355,11 @@ define i1 @length384_lt(ptr %x, ptr %y) nounwind {
 define i1 @length384_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length384_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2202,9 +2373,10 @@ define i1 @length384_gt(ptr %x, ptr %y) nounwind {
 define i1 @length384_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length384_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $384 # imm = 0x180
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2218,9 +2390,11 @@ define i1 @length384_eq_const(ptr %X) nounwind {
 define i32 @length511(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length511:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2231,9 +2405,11 @@ define i32 @length511(ptr %X, ptr %Y) nounwind {
 define i1 @length511_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2247,9 +2423,11 @@ define i1 @length511_eq(ptr %x, ptr %y) nounwind {
 define i1 @length511_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2263,9 +2441,11 @@ define i1 @length511_lt(ptr %x, ptr %y) nounwind {
 define i1 @length511_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length511_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2279,9 +2459,10 @@ define i1 @length511_gt(ptr %x, ptr %y) nounwind {
 define i1 @length511_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length511_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $511 # imm = 0x1FF
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2295,9 +2476,11 @@ define i1 @length511_eq_const(ptr %X) nounwind {
 define i32 @length512(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length512:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2308,9 +2491,11 @@ define i32 @length512(ptr %X, ptr %Y) nounwind {
 define i1 @length512_eq(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2324,9 +2509,11 @@ define i1 @length512_eq(ptr %x, ptr %y) nounwind {
 define i1 @length512_lt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_lt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    shrl $31, %eax
@@ -2340,9 +2527,11 @@ define i1 @length512_lt(ptr %x, ptr %y) nounwind {
 define i1 @length512_gt(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: length512_gt:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2356,9 +2545,10 @@ define i1 @length512_gt(ptr %x, ptr %y) nounwind {
 define i1 @length512_eq_const(ptr %X) nounwind {
 ; X86-LABEL: length512_eq_const:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl $512 # imm = 0x200
 ; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2373,9 +2563,11 @@ define i1 @length512_eq_const(ptr %X) nounwind {
 define i32 @huge_length(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: huge_length:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
@@ -2386,9 +2578,11 @@ define i32 @huge_length(ptr %X, ptr %Y) nounwind {
 define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: huge_length_eq:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
@@ -2411,9 +2605,12 @@ define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
 define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
 ; X86-LABEL: nonconst_length_eq:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll memcmp
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    testl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/memcpy-2.ll b/llvm/test/CodeGen/X86/memcpy-2.ll
index 25d2753480131..1d08df7f3c8c0 100644
--- a/llvm/test/CodeGen/X86/memcpy-2.ll
+++ b/llvm/test/CodeGen/X86/memcpy-2.ll
@@ -79,39 +79,39 @@ define void @t2(ptr nocapture %a, ptr nocapture %b) nounwind ssp {
 ; SSE2-Darwin-LABEL: t2:
 ; SSE2-Darwin:       ## %bb.0: ## %entry
 ; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Darwin-NEXT:    movaps (%ecx), %xmm0
+; SSE2-Darwin-NEXT:    movaps (%eax), %xmm0
+; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-Darwin-NEXT:    movaps %xmm0, (%eax)
 ; SSE2-Darwin-NEXT:    retl
 ;
 ; SSE2-Mingw32-LABEL: t2:
 ; SSE2-Mingw32:       # %bb.0: # %entry
 ; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE2-Mingw32-NEXT:    movaps (%ecx), %xmm0
+; SSE2-Mingw32-NEXT:    movaps (%eax), %xmm0
+; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-Mingw32-NEXT:    movaps %xmm0, (%eax)
 ; SSE2-Mingw32-NEXT:    retl
 ;
 ; SSE1-LABEL: t2:
 ; SSE1:       ## %bb.0: ## %entry
 ; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE1-NEXT:    movaps (%ecx), %xmm0
+; SSE1-NEXT:    movaps (%eax), %xmm0
+; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE1-NEXT:    movaps %xmm0, (%eax)
 ; SSE1-NEXT:    retl
 ;
 ; NOSSE-LABEL: t2:
 ; NOSSE:       ## %bb.0: ## %entry
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movl 12(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 12(%eax)
-; NOSSE-NEXT:    movl 8(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 8(%eax)
-; NOSSE-NEXT:    movl (%ecx), %edx
-; NOSSE-NEXT:    movl 4(%ecx), %ecx
-; NOSSE-NEXT:    movl %ecx, 4(%eax)
-; NOSSE-NEXT:    movl %edx, (%eax)
+; NOSSE-NEXT:    movl 12(%eax), %ecx
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movl %ecx, 12(%edx)
+; NOSSE-NEXT:    movl 8(%eax), %ecx
+; NOSSE-NEXT:    movl %ecx, 8(%edx)
+; NOSSE-NEXT:    movl (%eax), %ecx
+; NOSSE-NEXT:    movl 4(%eax), %eax
+; NOSSE-NEXT:    movl %eax, 4(%edx)
+; NOSSE-NEXT:    movl %ecx, (%edx)
 ; NOSSE-NEXT:    retl
 ;
 ; X86-64-LABEL: t2:
@@ -134,9 +134,9 @@ define void @t3(ptr nocapture %a, ptr nocapture %b) nounwind ssp {
 ; SSE2-Darwin-LABEL: t3:
 ; SSE2-Darwin:       ## %bb.0: ## %entry
 ; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE2-Darwin-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-Darwin-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-Darwin-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-Darwin-NEXT:    movsd %xmm1, 8(%eax)
 ; SSE2-Darwin-NEXT:    movsd %xmm0, (%eax)
 ; SSE2-Darwin-NEXT:    retl
@@ -144,9 +144,9 @@ define void @t3(ptr nocapture %a, ptr nocapture %b) nounwind ssp {
 ; SSE2-Mingw32-LABEL: t3:
 ; SSE2-Mingw32:       # %bb.0: # %entry
 ; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE2-Mingw32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-Mingw32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-Mingw32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE2-Mingw32-NEXT:    movsd %xmm1, 8(%eax)
 ; SSE2-Mingw32-NEXT:    movsd %xmm0, (%eax)
 ; SSE2-Mingw32-NEXT:    retl
@@ -154,29 +154,29 @@ define void @t3(ptr nocapture %a, ptr nocapture %b) nounwind ssp {
 ; SSE1-LABEL: t3:
 ; SSE1:       ## %bb.0: ## %entry
 ; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE1-NEXT:    movl 12(%ecx), %edx
-; SSE1-NEXT:    movl %edx, 12(%eax)
-; SSE1-NEXT:    movl 8(%ecx), %edx
-; SSE1-NEXT:    movl %edx, 8(%eax)
-; SSE1-NEXT:    movl (%ecx), %edx
-; SSE1-NEXT:    movl 4(%ecx), %ecx
-; SSE1-NEXT:    movl %ecx, 4(%eax)
-; SSE1-NEXT:    movl %edx, (%eax)
+; SSE1-NEXT:    movl 12(%eax), %ecx
+; SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; SSE1-NEXT:    movl %ecx, 12(%edx)
+; SSE1-NEXT:    movl 8(%eax), %ecx
+; SSE1-NEXT:    movl %ecx, 8(%edx)
+; SSE1-NEXT:    movl (%eax), %ecx
+; SSE1-NEXT:    movl 4(%eax), %eax
+; SSE1-NEXT:    movl %eax, 4(%edx)
+; SSE1-NEXT:    movl %ecx, (%edx)
 ; SSE1-NEXT:    retl
 ;
 ; NOSSE-LABEL: t3:
 ; NOSSE:       ## %bb.0: ## %entry
 ; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; NOSSE-NEXT:    movl 12(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 12(%eax)
-; NOSSE-NEXT:    movl 8(%ecx), %edx
-; NOSSE-NEXT:    movl %edx, 8(%eax)
-; NOSSE-NEXT:    movl (%ecx), %edx
-; NOSSE-NEXT:    movl 4(%ecx), %ecx
-; NOSSE-NEXT:    movl %ecx, 4(%eax)
-; NOSSE-NEXT:    movl %edx, (%eax)
+; NOSSE-NEXT:    movl 12(%eax), %ecx
+; NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOSSE-NEXT:    movl %ecx, 12(%edx)
+; NOSSE-NEXT:    movl 8(%eax), %ecx
+; NOSSE-NEXT:    movl %ecx, 8(%edx)
+; NOSSE-NEXT:    movl (%eax), %ecx
+; NOSSE-NEXT:    movl 4(%eax), %eax
+; NOSSE-NEXT:    movl %eax, 4(%edx)
+; NOSSE-NEXT:    movl %ecx, (%edx)
 ; NOSSE-NEXT:    retl
 ;
 ; X86-64-LABEL: t3:
diff --git a/llvm/test/CodeGen/X86/memset-2.ll b/llvm/test/CodeGen/X86/memset-2.ll
index a4fbbc5700470..d759e35144c1d 100644
--- a/llvm/test/CodeGen/X86/memset-2.ll
+++ b/llvm/test/CodeGen/X86/memset-2.ll
@@ -36,11 +36,11 @@ declare void @llvm.memset.p0.i32(ptr nocapture, i8, i32, i1) nounwind
 define void @t3(ptr nocapture %s, i8 %a) nounwind {
 ; CHECK-LABEL: t3:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    imull $16843009, %ecx, %ecx ## imm = 0x1010101
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %eax, 4(%ecx)
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    retl
 entry:
   tail call void @llvm.memset.p0.i32(ptr %s, i8 %a, i32 8, i1 false)
@@ -50,13 +50,13 @@ entry:
 define void @t4(ptr nocapture %s, i8 %a) nounwind {
 ; CHECK-LABEL: t4:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    imull $16843009, %ecx, %ecx ## imm = 0x1010101
-; CHECK-NEXT:    movl %ecx, 11(%eax)
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    movl %ecx, 4(%eax)
-; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    imull $16843009, %eax, %eax ## imm = 0x1010101
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %eax, 11(%ecx)
+; CHECK-NEXT:    movl %eax, 8(%ecx)
+; CHECK-NEXT:    movl %eax, 4(%ecx)
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    retl
 entry:
   tail call void @llvm.memset.p0.i32(ptr %s, i8 %a, i32 15, i1 false)
diff --git a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll
index a5ecdab880a6a..4506cc4ac9e70 100644
--- a/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll
+++ b/llvm/test/CodeGen/X86/memset-sse-stack-realignment.ll
@@ -14,7 +14,6 @@ define void @test1(i32 %t) nounwind {
 ; NOSSE-NEXT:    pushl %ebp
 ; NOSSE-NEXT:    movl %esp, %ebp
 ; NOSSE-NEXT:    subl $32, %esp
-; NOSSE-NEXT:    movl 8(%ebp), %eax
 ; NOSSE-NEXT:    movl $0, -4(%ebp)
 ; NOSSE-NEXT:    movl $0, -8(%ebp)
 ; NOSSE-NEXT:    movl $0, -12(%ebp)
@@ -23,6 +22,7 @@ define void @test1(i32 %t) nounwind {
 ; NOSSE-NEXT:    movl $0, -24(%ebp)
 ; NOSSE-NEXT:    movl $0, -28(%ebp)
 ; NOSSE-NEXT:    movl $0, -32(%ebp)
+; NOSSE-NEXT:    movl 8(%ebp), %eax
 ; NOSSE-NEXT:    addl $3, %eax
 ; NOSSE-NEXT:    andl $-4, %eax
 ; NOSSE-NEXT:    calll __alloca
@@ -41,10 +41,10 @@ define void @test1(i32 %t) nounwind {
 ; SSE-NEXT:    andl $-16, %esp
 ; SSE-NEXT:    subl $48, %esp
 ; SSE-NEXT:    movl %esp, %esi
-; SSE-NEXT:    movl 8(%ebp), %eax
 ; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    movaps %xmm0, 16(%esi)
 ; SSE-NEXT:    movaps %xmm0, (%esi)
+; SSE-NEXT:    movl 8(%ebp), %eax
 ; SSE-NEXT:    addl $3, %eax
 ; SSE-NEXT:    andl $-4, %eax
 ; SSE-NEXT:    calll __alloca
@@ -64,9 +64,9 @@ define void @test1(i32 %t) nounwind {
 ; AVX-NEXT:    andl $-32, %esp
 ; AVX-NEXT:    subl $64, %esp
 ; AVX-NEXT:    movl %esp, %esi
-; AVX-NEXT:    movl 8(%ebp), %eax
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovaps %ymm0, (%esi)
+; AVX-NEXT:    movl 8(%ebp), %eax
 ; AVX-NEXT:    addl $3, %eax
 ; AVX-NEXT:    andl $-4, %eax
 ; AVX-NEXT:    calll __alloca
@@ -91,11 +91,11 @@ define void @test2(i32 %t) nounwind {
 ; NOSSE-NEXT:    pushl %ebp
 ; NOSSE-NEXT:    movl %esp, %ebp
 ; NOSSE-NEXT:    subl $16, %esp
-; NOSSE-NEXT:    movl 8(%ebp), %eax
 ; NOSSE-NEXT:    movl $0, -4(%ebp)
 ; NOSSE-NEXT:    movl $0, -8(%ebp)
 ; NOSSE-NEXT:    movl $0, -12(%ebp)
 ; NOSSE-NEXT:    movl $0, -16(%ebp)
+; NOSSE-NEXT:    movl 8(%ebp), %eax
 ; NOSSE-NEXT:    addl $3, %eax
 ; NOSSE-NEXT:    andl $-4, %eax
 ; NOSSE-NEXT:    calll __alloca
@@ -114,9 +114,9 @@ define void @test2(i32 %t) nounwind {
 ; SSE-NEXT:    andl $-16, %esp
 ; SSE-NEXT:    subl $32, %esp
 ; SSE-NEXT:    movl %esp, %esi
-; SSE-NEXT:    movl 8(%ebp), %eax
 ; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    movaps %xmm0, (%esi)
+; SSE-NEXT:    movl 8(%ebp), %eax
 ; SSE-NEXT:    addl $3, %eax
 ; SSE-NEXT:    andl $-4, %eax
 ; SSE-NEXT:    calll __alloca
@@ -136,9 +136,9 @@ define void @test2(i32 %t) nounwind {
 ; AVX-NEXT:    andl $-16, %esp
 ; AVX-NEXT:    subl $32, %esp
 ; AVX-NEXT:    movl %esp, %esi
-; AVX-NEXT:    movl 8(%ebp), %eax
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, (%esi)
+; AVX-NEXT:    movl 8(%ebp), %eax
 ; AVX-NEXT:    addl $3, %eax
 ; AVX-NEXT:    andl $-4, %eax
 ; AVX-NEXT:    calll __alloca
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
index 480a0970bd39d..88afd317b792c 100644
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -6,8 +6,8 @@
 define void @bork(ptr nocapture align 4 %dst) nounwind {
 ; FAST-LABEL: bork:
 ; FAST:       # %bb.0:
-; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    xorps %xmm0, %xmm0
+; FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-NEXT:    movups %xmm0, 64(%eax)
 ; FAST-NEXT:    movups %xmm0, 48(%eax)
 ; FAST-NEXT:    movups %xmm0, 32(%eax)
@@ -17,8 +17,8 @@ define void @bork(ptr nocapture align 4 %dst) nounwind {
 ;
 ; SLOW_32-LABEL: bork:
 ; SLOW_32:       # %bb.0:
-; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SLOW_32-NEXT:    xorps %xmm0, %xmm0
+; SLOW_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SLOW_32-NEXT:    movsd %xmm0, 72(%eax)
 ; SLOW_32-NEXT:    movsd %xmm0, 64(%eax)
 ; SLOW_32-NEXT:    movsd %xmm0, 56(%eax)
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index d8be4cfd27238..9396e7b4feb38 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -23,9 +23,8 @@ define <2 x double> @merge_2f64_f64_23(ptr %ptr) nounwind uwtable noinline ssp {
 ; X86-SSE1-LABEL: merge_2f64_f64_23:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    fldl 16(%eax)
 ; X86-SSE1-NEXT:    fldl 24(%eax)
-; X86-SSE1-NEXT:    fxch %st(1)
+; X86-SSE1-NEXT:    fldl 16(%eax)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: merge_2f64_f64_23:
@@ -55,26 +54,16 @@ define <2 x i64> @merge_2i64_i64_12(ptr %ptr) nounwind uwtable noinline ssp {
 ;
 ; X86-SSE1-LABEL: merge_2i64_i64_12:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movl 20(%ecx), %edx
+; X86-SSE1-NEXT:    movl %edx, 12(%eax)
+; X86-SSE1-NEXT:    movl 16(%ecx), %edx
+; X86-SSE1-NEXT:    movl %edx, 8(%eax)
 ; X86-SSE1-NEXT:    movl 8(%ecx), %edx
-; X86-SSE1-NEXT:    movl 12(%ecx), %esi
-; X86-SSE1-NEXT:    movl 16(%ecx), %edi
-; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
-; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE1-NEXT:    movl %edi, 8(%eax)
-; X86-SSE1-NEXT:    movl %esi, 4(%eax)
+; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE1-NEXT:    movl %edx, (%eax)
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl $4
 ;
 ; X86-SSE41-LABEL: merge_2i64_i64_12:
@@ -158,8 +147,8 @@ define <4 x float> @merge_4f32_f32_34uu(ptr %ptr) nounwind uwtable noinline ssp
 ;
 ; X86-SSE1-LABEL: merge_4f32_f32_34uu:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    retl
 ;
@@ -240,8 +229,8 @@ define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp
 ;
 ; X86-SSE1-LABEL: merge_4f32_f32_45zz:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    retl
 ;
@@ -281,8 +270,8 @@ define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp
 ;
 ; X86-SSE1-LABEL: merge_4f32_f32_012u:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -328,8 +317,8 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
 ;
 ; X86-SSE1-LABEL: merge_4f32_f32_019u:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -406,9 +395,9 @@ define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    movl 8(%ecx), %edx
 ; X86-SSE1-NEXT:    movl 12(%ecx), %esi
-; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %esi, 4(%eax)
 ; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
@@ -458,9 +447,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc2(ptr %ptr) nounwind uwtable noinline s
 ; X86-SSE1-NEXT:    movl 12(%ecx), %esi
 ; X86-SSE1-NEXT:    leal 1(%edx), %edi
 ; X86-SSE1-NEXT:    movl %edi, 8(%ecx)
-; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %esi, 4(%eax)
 ; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
@@ -515,9 +504,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc3(ptr %ptr) nounwind uwtable noinline s
 ; X86-SSE1-NEXT:    movl 12(%ecx), %esi
 ; X86-SSE1-NEXT:    leal 1(%esi), %edi
 ; X86-SSE1-NEXT:    movl %edi, 12(%ecx)
-; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %esi, 4(%eax)
 ; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movl 20(%ecx), %ecx
 ; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
@@ -771,25 +760,20 @@ define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
 ;
 ; X86-SSE1-LABEL: merge_v4i32_i32_3210:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %edi, -8
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl 12(%ecx), %edx
-; X86-SSE1-NEXT:    movl 8(%ecx), %esi
-; X86-SSE1-NEXT:    movl (%ecx), %edi
-; X86-SSE1-NEXT:    movl 4(%ecx), %ecx
-; X86-SSE1-NEXT:    movl %edi, 12(%eax)
-; X86-SSE1-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE1-NEXT:    movl %esi, 4(%eax)
-; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movl (%ecx), %edx
+; X86-SSE1-NEXT:    movl 4(%ecx), %esi
+; X86-SSE1-NEXT:    movl %edx, 12(%eax)
+; X86-SSE1-NEXT:    movl %esi, 8(%eax)
+; X86-SSE1-NEXT:    movl 8(%ecx), %edx
+; X86-SSE1-NEXT:    movl %edx, 4(%eax)
+; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, (%eax)
 ; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %edi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl $4
 ;
@@ -827,26 +811,16 @@ define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ss
 ;
 ; X86-SSE1-LABEL: merge_8i16_i16_23u567u9:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movzwl 14(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 10(%eax)
+; X86-SSE1-NEXT:    movzwl 18(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 14(%eax)
 ; X86-SSE1-NEXT:    movl 4(%ecx), %edx
-; X86-SSE1-NEXT:    movl 10(%ecx), %esi
-; X86-SSE1-NEXT:    movzwl 14(%ecx), %edi
-; X86-SSE1-NEXT:    movzwl 18(%ecx), %ecx
-; X86-SSE1-NEXT:    movw %di, 10(%eax)
-; X86-SSE1-NEXT:    movw %cx, 14(%eax)
-; X86-SSE1-NEXT:    movl %esi, 6(%eax)
+; X86-SSE1-NEXT:    movl 10(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, 6(%eax)
 ; X86-SSE1-NEXT:    movl %edx, (%eax)
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl $4
 ;
 ; X86-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -923,10 +897,10 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl 8(%ecx), %edx
-; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
-; X86-SSE1-NEXT:    movw %cx, 6(%eax)
-; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movzwl 14(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 6(%eax)
+; X86-SSE1-NEXT:    movl 8(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, (%eax)
 ; X86-SSE1-NEXT:    movl $0, 12(%eax)
 ; X86-SSE1-NEXT:    movl $0, 8(%eax)
 ; X86-SSE1-NEXT:    retl $4
@@ -975,51 +949,28 @@ define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ss
 ;
 ; X86-SSE1-LABEL: merge_8i16_i16_76543210:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    pushl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
-; X86-SSE1-NEXT:    pushl %eax
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
-; X86-SSE1-NEXT:    .cfi_offset %esi, -20
-; X86-SSE1-NEXT:    .cfi_offset %edi, -16
-; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
-; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movzwl 14(%eax), %ecx
-; X86-SSE1-NEXT:    movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-SSE1-NEXT:    movzwl 12(%eax), %ecx
-; X86-SSE1-NEXT:    movw %cx, (%esp) # 2-byte Spill
-; X86-SSE1-NEXT:    movzwl 10(%eax), %esi
-; X86-SSE1-NEXT:    movzwl 8(%eax), %edi
-; X86-SSE1-NEXT:    movzwl 6(%eax), %ebx
-; X86-SSE1-NEXT:    movzwl 4(%eax), %ebp
-; X86-SSE1-NEXT:    movzwl (%eax), %ecx
-; X86-SSE1-NEXT:    movzwl 2(%eax), %edx
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movw %cx, 14(%eax)
-; X86-SSE1-NEXT:    movw %dx, 12(%eax)
-; X86-SSE1-NEXT:    movw %bp, 10(%eax)
-; X86-SSE1-NEXT:    movw %bx, 8(%eax)
-; X86-SSE1-NEXT:    movw %di, 6(%eax)
-; X86-SSE1-NEXT:    movw %si, 4(%eax)
-; X86-SSE1-NEXT:    movzwl (%esp), %ecx # 2-byte Folded Reload
-; X86-SSE1-NEXT:    movw %cx, 2(%eax)
-; X86-SSE1-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 2-byte Folded Reload
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movzwl (%ecx), %edx
+; X86-SSE1-NEXT:    movzwl 2(%ecx), %esi
+; X86-SSE1-NEXT:    movw %dx, 14(%eax)
+; X86-SSE1-NEXT:    movw %si, 12(%eax)
+; X86-SSE1-NEXT:    movzwl 4(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 10(%eax)
+; X86-SSE1-NEXT:    movzwl 6(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 8(%eax)
+; X86-SSE1-NEXT:    movzwl 8(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 6(%eax)
+; X86-SSE1-NEXT:    movzwl 10(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 4(%eax)
+; X86-SSE1-NEXT:    movzwl 12(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 2(%eax)
+; X86-SSE1-NEXT:    movzwl 14(%ecx), %ecx
 ; X86-SSE1-NEXT:    movw %cx, (%eax)
-; X86-SSE1-NEXT:    addl $4, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
 ; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE1-NEXT:    popl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    popl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %ebp
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl $4
 ;
@@ -1069,40 +1020,20 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noin
 ;
 ; X86-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    pushl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
-; X86-SSE1-NEXT:    .cfi_offset %esi, -20
-; X86-SSE1-NEXT:    .cfi_offset %edi, -16
-; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
-; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movzwl (%ecx), %ebp
-; X86-SSE1-NEXT:    movl 3(%ecx), %esi
-; X86-SSE1-NEXT:    movl 7(%ecx), %edi
-; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
 ; X86-SSE1-NEXT:    movzbl 13(%ecx), %edx
-; X86-SSE1-NEXT:    movzbl 15(%ecx), %ecx
 ; X86-SSE1-NEXT:    movb %dl, 13(%eax)
-; X86-SSE1-NEXT:    movb %cl, 15(%eax)
-; X86-SSE1-NEXT:    movw %bx, 11(%eax)
-; X86-SSE1-NEXT:    movl %edi, 7(%eax)
-; X86-SSE1-NEXT:    movl %esi, 3(%eax)
-; X86-SSE1-NEXT:    movw %bp, (%eax)
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE1-NEXT:    popl %edi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    popl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %ebp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    movzbl 15(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 15(%eax)
+; X86-SSE1-NEXT:    movzwl 11(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 11(%eax)
+; X86-SSE1-NEXT:    movl 7(%ecx), %edx
+; X86-SSE1-NEXT:    movl %edx, 7(%eax)
+; X86-SSE1-NEXT:    movl 3(%ecx), %edx
+; X86-SSE1-NEXT:    movl %edx, 3(%eax)
+; X86-SSE1-NEXT:    movzwl (%ecx), %ecx
+; X86-SSE1-NEXT:    movw %cx, (%eax)
 ; X86-SSE1-NEXT:    retl $4
 ;
 ; X86-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -1169,10 +1100,10 @@ define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(ptr %ptr) nounwind uwtable noin
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movzwl (%ecx), %edx
-; X86-SSE1-NEXT:    movzbl 3(%ecx), %ecx
-; X86-SSE1-NEXT:    movb %cl, 3(%eax)
-; X86-SSE1-NEXT:    movw %dx, (%eax)
+; X86-SSE1-NEXT:    movzbl 3(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 3(%eax)
+; X86-SSE1-NEXT:    movzwl (%ecx), %ecx
+; X86-SSE1-NEXT:    movw %cx, (%eax)
 ; X86-SSE1-NEXT:    movb $0, 15(%eax)
 ; X86-SSE1-NEXT:    movw $0, 13(%eax)
 ; X86-SSE1-NEXT:    movw $0, 6(%eax)
@@ -1214,10 +1145,10 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl (%ecx), %edx
-; X86-SSE1-NEXT:    movzwl 6(%ecx), %ecx
-; X86-SSE1-NEXT:    movw %cx, 6(%eax)
-; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    movzwl 6(%ecx), %edx
+; X86-SSE1-NEXT:    movw %dx, 6(%eax)
+; X86-SSE1-NEXT:    movl (%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, (%eax)
 ; X86-SSE1-NEXT:    movb $0, 15(%eax)
 ; X86-SSE1-NEXT:    movw $0, 13(%eax)
 ; X86-SSE1-NEXT:    retl $4
@@ -1281,74 +1212,40 @@ define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noin
 ;
 ; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    subl $12, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
-; X86-SSE1-NEXT:    .cfi_offset %esi, -12
-; X86-SSE1-NEXT:    .cfi_offset %ebx, -8
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE1-NEXT:    movzbl 15(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 14(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 13(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 12(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 11(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 10(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 9(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 8(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 7(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movzbl 6(%esi), %ecx
-; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-SSE1-NEXT:    movb 5(%esi), %bh
-; X86-SSE1-NEXT:    movb 4(%esi), %bl
-; X86-SSE1-NEXT:    movb 3(%esi), %dh
-; X86-SSE1-NEXT:    movb 2(%esi), %ch
-; X86-SSE1-NEXT:    movb (%esi), %cl
-; X86-SSE1-NEXT:    movb 1(%esi), %dl
-; X86-SSE1-NEXT:    movb %cl, 15(%eax)
-; X86-SSE1-NEXT:    movb %dl, 14(%eax)
-; X86-SSE1-NEXT:    movb %ch, 13(%eax)
-; X86-SSE1-NEXT:    movb %dh, 12(%eax)
-; X86-SSE1-NEXT:    movb %bl, 11(%eax)
-; X86-SSE1-NEXT:    movb %bh, 10(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 9(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 8(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 7(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 6(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 5(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 4(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 3(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 2(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SSE1-NEXT:    movb %cl, 1(%eax)
-; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movzbl (%ecx), %edx
+; X86-SSE1-NEXT:    movb 1(%ecx), %dh
+; X86-SSE1-NEXT:    movb %dl, 15(%eax)
+; X86-SSE1-NEXT:    movb %dh, 14(%eax)
+; X86-SSE1-NEXT:    movzbl 2(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 13(%eax)
+; X86-SSE1-NEXT:    movzbl 3(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 12(%eax)
+; X86-SSE1-NEXT:    movzbl 4(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 11(%eax)
+; X86-SSE1-NEXT:    movzbl 5(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 10(%eax)
+; X86-SSE1-NEXT:    movzbl 6(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 9(%eax)
+; X86-SSE1-NEXT:    movzbl 7(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 8(%eax)
+; X86-SSE1-NEXT:    movzbl 8(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 7(%eax)
+; X86-SSE1-NEXT:    movzbl 9(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 6(%eax)
+; X86-SSE1-NEXT:    movzbl 10(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 5(%eax)
+; X86-SSE1-NEXT:    movzbl 11(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 4(%eax)
+; X86-SSE1-NEXT:    movzbl 12(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 3(%eax)
+; X86-SSE1-NEXT:    movzbl 13(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 2(%eax)
+; X86-SSE1-NEXT:    movzbl 14(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, 1(%eax)
+; X86-SSE1-NEXT:    movzbl 15(%ecx), %ecx
 ; X86-SSE1-NEXT:    movb %cl, (%eax)
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    popl %ebx
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE1-NEXT:    retl $4
 ;
 ; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
@@ -1424,18 +1321,18 @@ define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
 ; X86-SSE1-LABEL: merge_4i32_i32_combine:
 ; X86-SSE1:       # %bb.0:
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE1-NEXT:    andps %xmm0, %xmm1
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE1-NEXT:    movaps %xmm1, (%eax)
 ; X86-SSE1-NEXT:    retl
 ;
 ; X86-SSE41-LABEL: merge_4i32_i32_combine:
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE41-NEXT:    retl
  %1 = load i32, ptr %src
@@ -1586,8 +1483,8 @@ define <4 x float> @merge_4f32_f32_X0YY(ptr %ptr0, ptr %ptr1) nounwind uwtable n
 ; X86-SSE-LABEL: merge_4f32_f32_X0YY:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
 ; X86-SSE-NEXT:    retl
@@ -1687,22 +1584,20 @@ define <4 x i32> @no_reverse_vzload(ptr %p0) nounwind {
 ;
 ; X86-SSE1-LABEL: no_reverse_vzload:
 ; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %ebx
 ; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    xorl %ecx, %ecx
-; X86-SSE1-NEXT:    cmpl $0, (%edx)
-; X86-SSE1-NEXT:    setg %cl
-; X86-SSE1-NEXT:    negl %ecx
-; X86-SSE1-NEXT:    xorl %ebx, %ebx
-; X86-SSE1-NEXT:    cmpl $0, 4(%edx)
-; X86-SSE1-NEXT:    setg %bl
-; X86-SSE1-NEXT:    negl %ebx
-; X86-SSE1-NEXT:    movl %ebx, 4(%eax)
-; X86-SSE1-NEXT:    movl %ecx, (%eax)
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    xorl %edx, %edx
+; X86-SSE1-NEXT:    cmpl $0, 4(%ecx)
+; X86-SSE1-NEXT:    setg %dl
+; X86-SSE1-NEXT:    negl %edx
+; X86-SSE1-NEXT:    movl %edx, 4(%eax)
+; X86-SSE1-NEXT:    xorl %edx, %edx
+; X86-SSE1-NEXT:    cmpl $0, (%ecx)
+; X86-SSE1-NEXT:    setg %dl
+; X86-SSE1-NEXT:    negl %edx
+; X86-SSE1-NEXT:    movl %edx, (%eax)
 ; X86-SSE1-NEXT:    movl $0, 12(%eax)
 ; X86-SSE1-NEXT:    movl $0, 8(%eax)
-; X86-SSE1-NEXT:    popl %ebx
 ; X86-SSE1-NEXT:    retl $4
 ;
 ; X86-SSE41-LABEL: no_reverse_vzload:
@@ -1710,10 +1605,10 @@ define <4 x i32> @no_reverse_vzload(ptr %p0) nounwind {
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE41-NEXT:    paddd %xmm1, %xmm1
-; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; X86-SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
+; X86-SSE41-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
 ; X86-SSE41-NEXT:    retl
   %i0 = load <2 x i32>, ptr %p0, align 4
   %i1 = shufflevector <2 x i32> %i0, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
index 30853582e0c21..ca456bbac591a 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -329,10 +329,10 @@ define <8 x float> @merge_8f32_2f32_23z5(ptr %ptr) nounwind uwtable noinline ssp
 ; X86-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovups 16(%eax), %xmm0
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; X86-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX-NEXT:    vmovups 16(%eax), %xmm1
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, ptr %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, ptr %ptr, i64 3
@@ -912,10 +912,10 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(ptr %ptr) nounwind
 ; X86-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movzwl (%eax), %ecx
 ; X86-AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpinsrw $4, 24(%eax), %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpinsrw $6, 28(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    movzwl (%eax), %ecx
 ; X86-AVX-NEXT:    vpinsrw $7, 30(%eax), %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm1
 ; X86-AVX-NEXT:    vpinsrw $3, 6(%eax), %xmm1, %xmm1
@@ -958,8 +958,8 @@ define <2 x i8> @PR42846(ptr %j, <2 x i8> %k) {
 ;
 ; X86-AVX-LABEL: PR42846:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovdqa l, %ymm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vpextrw $0, %xmm0, (%eax)
 ; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; X86-AVX-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 9c3057e4e42a4..5dab5c4f9271e 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -15,11 +15,11 @@ define void @merge_2_v4f32_align32(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
-; X86-NEXT:    movaps 16(%ecx), %xmm1
-; X86-NEXT:    movntps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    movaps 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movntps %xmm1, 16(%eax)
+; X86-NEXT:    movntps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align32:
@@ -74,11 +74,11 @@ define void @merge_2_v4f32_align32_mix_ntload(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
-; X86-NEXT:    movaps 16(%ecx), %xmm1
-; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    movaps 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm1, 16(%eax)
+; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
@@ -126,11 +126,11 @@ define void @merge_2_v4f32_align32_mix_ntstore(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
-; X86-NEXT:    movaps 16(%ecx), %xmm1
-; X86-NEXT:    movntps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    movaps 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm1, 16(%eax)
+; X86-NEXT:    movntps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
@@ -163,11 +163,11 @@ define void @merge_2_v4f32_align16_ntload(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align16_ntload:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
-; X86-NEXT:    movaps 16(%ecx), %xmm1
-; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    movaps 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm1, 16(%eax)
+; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
@@ -216,11 +216,11 @@ define void @merge_2_v4f32_align16_ntstore(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align16_ntstore:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
-; X86-NEXT:    movaps 16(%ecx), %xmm1
-; X86-NEXT:    movntps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    movaps 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movntps %xmm1, 16(%eax)
+; X86-NEXT:    movntps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
@@ -253,11 +253,11 @@ define void @merge_2_v4f32_align1_ntload(ptr %a0, ptr %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align1_ntload:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movups (%ecx), %xmm0
-; X86-NEXT:    movups 16(%ecx), %xmm1
-; X86-NEXT:    movups %xmm0, (%eax)
+; X86-NEXT:    movups (%eax), %xmm0
+; X86-NEXT:    movups 16(%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movups %xmm1, 16(%eax)
+; X86-NEXT:    movups %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
@@ -289,11 +289,13 @@ define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
@@ -303,8 +305,6 @@ define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
@@ -319,15 +319,15 @@ define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
+; X86-SSE4A-NEXT:    movsd 16(%eax), %xmm0 # xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd 24(%eax), %xmm1 # xmm1 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd (%eax), %xmm2 # xmm2 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd 8(%eax), %xmm3 # xmm3 = mem[0],zero
+; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE4A-NEXT:    movntsd %xmm2, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 24(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm0, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm3, 8(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
@@ -400,11 +400,13 @@ define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE2-LABEL: merge_2_v4f32_align1:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
@@ -414,8 +416,6 @@ define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
-; X86-SSE2-NEXT:    movd %xmm0, %ecx
-; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
@@ -430,15 +430,15 @@ define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
 ; X86-SSE4A-LABEL: merge_2_v4f32_align1:
 ; X86-SSE4A:       # %bb.0:
 ; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE4A-NEXT:    movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
-; X86-SSE4A-NEXT:    movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
-; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
-; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
+; X86-SSE4A-NEXT:    movsd 16(%eax), %xmm0 # xmm0 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd 24(%eax), %xmm1 # xmm1 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd (%eax), %xmm2 # xmm2 = mem[0],zero
+; X86-SSE4A-NEXT:    movsd 8(%eax), %xmm3 # xmm3 = mem[0],zero
+; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE4A-NEXT:    movntsd %xmm2, (%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm1, 24(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm0, 16(%eax)
+; X86-SSE4A-NEXT:    movntsd %xmm3, 8(%eax)
 ; X86-SSE4A-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: merge_2_v4f32_align1:
diff --git a/llvm/test/CodeGen/X86/merge-sp-update-lea.ll b/llvm/test/CodeGen/X86/merge-sp-update-lea.ll
index 28cbd33ef0972..578d3c5e0f7f4 100644
--- a/llvm/test/CodeGen/X86/merge-sp-update-lea.ll
+++ b/llvm/test/CodeGen/X86/merge-sp-update-lea.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc %s -o - | FileCheck %s
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.5"
@@ -11,6 +12,32 @@ target triple = "i386-apple-macosx10.5"
 ; the LEA instruction.
 ; CHECK-NEXT: leal 24(%esp), %esp
 define noalias ptr @useLEA(ptr nocapture %p, i32 %nbytes) #0 {
+; CHECK-LABEL: useLEA:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    leal -{{[0-9]+}}(%esp), %esp
+; CHECK-NEXT:    movl 12(%ebp), %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    js LBB0_4
+; CHECK-NEXT:  ## %bb.1: ## %cond.false
+; CHECK-NEXT:    movl 8(%ebp), %ecx
+; CHECK-NEXT:    movl %ecx, (%esp)
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne LBB0_3
+; CHECK-NEXT:  ## %bb.2: ## %cond.false
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:  LBB0_3: ## %cond.false
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    calll _realloc
+; CHECK-NEXT:    jmp LBB0_5
+; CHECK-NEXT:  LBB0_4: ## %cond.end.3
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:  LBB0_5: ## %cond.end.3
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i32 %nbytes, 0
   br i1 %cmp, label %cond.end.3, label %cond.false
diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll
index 8030d5f08fa57..4b6de11c9c4fc 100644
--- a/llvm/test/CodeGen/X86/merge-store-constants.ll
+++ b/llvm/test/CodeGen/X86/merge-store-constants.ll
@@ -5,8 +5,8 @@
 define void @big_nonzero_16_bytes(ptr nocapture %a) {
 ; X32-LABEL: big_nonzero_16_bytes:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovups %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
@@ -33,8 +33,8 @@ define void @big_nonzero_16_bytes(ptr nocapture %a) {
 define void @big_nonzero_16_bytes_big64bit_constants(ptr nocapture %a) {
 ; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovups %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
@@ -57,8 +57,8 @@ define void @big_nonzero_16_bytes_big64bit_constants(ptr nocapture %a) {
 define void @big_nonzero_32_bytes_splat(ptr nocapture %a) {
 ; X32-LABEL: big_nonzero_32_bytes_splat:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovups %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
@@ -93,16 +93,16 @@ define void @big_nonzero_32_bytes_splat(ptr nocapture %a) {
 define void @big_nonzero_63_bytes(ptr nocapture %a) {
 ; X32-LABEL: big_nonzero_63_bytes:
 ; X32:       # %bb.0:
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovups %xmm0, 32(%eax)
 ; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
 ; X32-NEXT:    vmovups %ymm0, (%eax)
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
-; X32-NEXT:    vmovups %xmm0, 32(%eax)
 ; X32-NEXT:    movl $0, 52(%eax)
 ; X32-NEXT:    movl $7, 48(%eax)
-; X32-NEXT:    movl $8, 56(%eax)
-; X32-NEXT:    movw $9, 60(%eax)
 ; X32-NEXT:    movb $10, 62(%eax)
+; X32-NEXT:    movw $9, 60(%eax)
+; X32-NEXT:    movl $8, 56(%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll
index f2d3bc8217b1d..7546afe2479aa 100644
--- a/llvm/test/CodeGen/X86/mfence.ll
+++ b/llvm/test/CodeGen/X86/mfence.ll
@@ -21,8 +21,8 @@ define void @test() {
 define i32 @fence(ptr %ptr) {
 ; X86-LABEL: fence:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mfence
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index 4de448a099b30..4055fb154c221 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -29,25 +29,28 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind {
 ; X86-LABEL: scalar_i32_signed_reg_reg:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    setle %dl
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    subl %ebx, %esi
 ; X86-NEXT:    jg .LBB0_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:    imull %esi, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %t3 = icmp sgt i32 %a1, %a2 ; signed
@@ -79,23 +82,25 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind {
 ;
 ; X86-LABEL: scalar_i32_unsigned_reg_reg:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setbe %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %esi, %ecx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl %esi, %edi
+; X86-NEXT:    setbe %al
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:    imull %edi, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %t3 = icmp ugt i32 %a1, %a2
   %t4 = select i1 %t3, i32 -1, i32 1
@@ -129,26 +134,29 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind {
 ; X86-LABEL: scalar_i32_signed_mem_reg:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    setle %dl
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    subl %ebx, %esi
 ; X86-NEXT:    jg .LBB2_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:  .LBB2_2:
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:    imull %esi, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %a1 = load i32, ptr %a1_addr
@@ -182,26 +190,29 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind {
 ; X86-LABEL: scalar_i32_signed_reg_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %esi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %esi
+; X86-NEXT:    movl (%eax), %ebx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    setle %dl
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    subl %ebx, %esi
 ; X86-NEXT:    jg .LBB3_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:  .LBB3_2:
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:    imull %esi, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %a2 = load i32, ptr %a2_addr
@@ -236,27 +247,30 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-LABEL: scalar_i32_signed_mem_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %esi
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %esi
+; X86-NEXT:    movl (%eax), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    setle %dl
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    subl %ebx, %esi
 ; X86-NEXT:    jg .LBB4_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:  .LBB4_2:
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:    imull %esi, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %a1 = load i32, ptr %a1_addr
@@ -300,32 +314,32 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    setl %bl
-; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    jl .LBB5_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB5_2:
-; X86-NEXT:    negl %ebx
+; X86-NEXT:    setl %dl
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    shrl %edi
+; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    imull %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -366,35 +380,35 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %esi, %ebp
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    shrdl $1, %ebx, %eax
+; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    orl $1, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    subl %ebp, %eax
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    xorl %edx, %ebp
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    shrdl $1, %ebp, %eax
-; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    shrl %ebx
+; X86-NEXT:    imull %edi, %ebx
 ; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    shrl %ebp
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -436,33 +450,33 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    movl 4(%eax), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    setl %bl
-; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    jl .LBB7_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB7_2:
-; X86-NEXT:    negl %ebx
+; X86-NEXT:    setl %dl
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    shrl %edi
+; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    imull %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -508,30 +522,30 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %ebp
+; X86-NEXT:    movl 4(%eax), %ebx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    setl %bl
-; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    jl .LBB8_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB8_2:
-; X86-NEXT:    negl %ebx
+; X86-NEXT:    setl %dl
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    shrl %edi
+; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    imull %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -575,34 +589,34 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    movl 4(%ecx), %ecx
 ; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %ebp
+; X86-NEXT:    movl 4(%eax), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    movl 4(%eax), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    setl %bl
-; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    jl .LBB9_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB9_2:
-; X86-NEXT:    negl %ebx
+; X86-NEXT:    setl %dl
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    shrl %edi
+; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    negl %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    imull %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
@@ -651,24 +665,25 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
 ;
 ; X86-LABEL: scalar_i16_signed_reg_reg:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subw %dx, %ax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    subw %dx, %si
+; X86-NEXT:    setle %dl
 ; X86-NEXT:    jg .LBB10_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %eax
+; X86-NEXT:    negl %esi
 ; X86-NEXT:  .LBB10_2:
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movzwl %si, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    leal -1(%ecx,%ecx), %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %t3 = icmp sgt i16 %a1, %a2 ; signed
   %t4 = select i1 %t3, i16 -1, i16 1
@@ -705,18 +720,19 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpw %si, %cx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    subw %si, %di
-; X86-NEXT:    setbe %dl
-; X86-NEXT:    leal -1(%edx,%edx), %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    movzwl %di, %eax
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    setbe %al
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movzwl %di, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal -1(%eax,%eax), %eax
 ; X86-NEXT:    imull %edx, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -758,25 +774,26 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
 ;
 ; X86-LABEL: scalar_i16_signed_mem_reg:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subw %dx, %ax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    subw %dx, %si
+; X86-NEXT:    setle %dl
 ; X86-NEXT:    jg .LBB12_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %eax
+; X86-NEXT:    negl %esi
 ; X86-NEXT:  .LBB12_2:
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movzwl %si, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    leal -1(%ecx,%ecx), %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %a1 = load i16, ptr %a1_addr
   %t3 = icmp sgt i16 %a1, %a2 ; signed
@@ -812,25 +829,26 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind {
 ;
 ; X86-LABEL: scalar_i16_signed_reg_mem:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subw %dx, %ax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    subw %dx, %si
+; X86-NEXT:    setle %dl
 ; X86-NEXT:    jg .LBB13_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %eax
+; X86-NEXT:    negl %esi
 ; X86-NEXT:  .LBB13_2:
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movzwl %si, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    leal -1(%ecx,%ecx), %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %a2 = load i16, ptr %a2_addr
   %t3 = icmp sgt i16 %a1, %a2 ; signed
@@ -866,26 +884,27 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ;
 ; X86-LABEL: scalar_i16_signed_mem_mem:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subw %dx, %ax
-; X86-NEXT:    setle %bl
-; X86-NEXT:    leal -1(%ebx,%ebx), %edx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    subw %dx, %si
+; X86-NEXT:    setle %dl
 ; X86-NEXT:    jg .LBB14_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    negl %eax
+; X86-NEXT:    negl %esi
 ; X86-NEXT:  .LBB14_2:
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movzwl %si, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    leal -1(%ecx,%ecx), %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %a1 = load i16, ptr %a1_addr
   %a2 = load i16, ptr %a2_addr
@@ -928,17 +947,18 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movb %ah, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    subb %ah, %al
-; X86-NEXT:    setg %dl
 ; X86-NEXT:    jg .LBB15_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    subb %cl, %ah
-; X86-NEXT:    movb %ah, %al
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB15_2:
+; X86-NEXT:    setg %dl
+; X86-NEXT:    shrb %al
 ; X86-NEXT:    negb %dl
 ; X86-NEXT:    orb $1, %dl
-; X86-NEXT:    shrb %al
 ; X86-NEXT:    mulb %dl
 ; X86-NEXT:    addb %cl, %al
 ; X86-NEXT:    retl
@@ -972,17 +992,19 @@ define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
 ;
 ; X86-LABEL: scalar_i8_unsigned_reg_reg:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb %ah, %cl
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    negb %ah
-; X86-NEXT:    orb $1, %ah
+; X86-NEXT:    movb %cl, %al
+; X86-NEXT:    subb %ah, %al
+; X86-NEXT:    seta %ah
 ; X86-NEXT:    xorb %dl, %al
 ; X86-NEXT:    subb %dl, %al
 ; X86-NEXT:    shrb %al
+; X86-NEXT:    negb %ah
+; X86-NEXT:    orb $1, %ah
 ; X86-NEXT:    mulb %ah
 ; X86-NEXT:    addb %cl, %al
 ; X86-NEXT:    retl
@@ -1023,17 +1045,18 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind {
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl (%ecx), %ecx
+; X86-NEXT:    movb %ah, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    subb %ah, %al
-; X86-NEXT:    setg %dl
 ; X86-NEXT:    jg .LBB17_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    subb %cl, %ah
-; X86-NEXT:    movb %ah, %al
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB17_2:
+; X86-NEXT:    setg %dl
+; X86-NEXT:    shrb %al
 ; X86-NEXT:    negb %dl
 ; X86-NEXT:    orb $1, %dl
-; X86-NEXT:    shrb %al
 ; X86-NEXT:    mulb %dl
 ; X86-NEXT:    addb %cl, %al
 ; X86-NEXT:    retl
@@ -1072,17 +1095,18 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, ptr %a2_addr) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb (%eax), %ah
+; X86-NEXT:    movb %ah, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    subb %ah, %al
-; X86-NEXT:    setg %dl
 ; X86-NEXT:    jg .LBB18_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    subb %cl, %ah
-; X86-NEXT:    movb %ah, %al
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB18_2:
+; X86-NEXT:    setg %dl
+; X86-NEXT:    shrb %al
 ; X86-NEXT:    negb %dl
 ; X86-NEXT:    orb $1, %dl
-; X86-NEXT:    shrb %al
 ; X86-NEXT:    mulb %dl
 ; X86-NEXT:    addb %cl, %al
 ; X86-NEXT:    retl
@@ -1120,20 +1144,21 @@ define i8 @scalar_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-LABEL: scalar_i8_signed_mem_mem:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl (%ecx), %ecx
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb (%eax), %ah
+; X86-NEXT:    movb %ah, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movb %cl, %al
 ; X86-NEXT:    subb %ah, %al
-; X86-NEXT:    setg %dl
 ; X86-NEXT:    jg .LBB19_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    subb %cl, %ah
-; X86-NEXT:    movb %ah, %al
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:  .LBB19_2:
+; X86-NEXT:    setg %dl
+; X86-NEXT:    shrb %al
 ; X86-NEXT:    negb %dl
 ; X86-NEXT:    orb $1, %dl
-; X86-NEXT:    shrb %al
 ; X86-NEXT:    mulb %dl
 ; X86-NEXT:    addb %cl, %al
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
index c997d314a50ae..ce68e127ed8fe 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
@@ -54,9 +54,9 @@ define dso_local void @test_sign_ext(ptr %f, ptr %i) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    sarl $31, %ecx
-; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl %eax, 8(%ecx)
+; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    movl %eax, 12(%ecx)
 ; CHECK-NEXT:    jmp _use_foo # TAILCALL
 ;
 ; CHECK-O0-LABEL: test_sign_ext:
@@ -171,9 +171,10 @@ entry:
 define dso_local void @test_null_arg(ptr %f) {
 ; CHECK-LABEL: test_null_arg:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    pushl $0
 ; CHECK-NEXT:    pushl $0
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll _test_noop2
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
@@ -199,9 +200,9 @@ define dso_local void @test_unrecognized(ptr %f, ptr addrspace(14) %i) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ecx, 8(%eax)
-; CHECK-NEXT:    sarl $31, %ecx
-; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl %eax, 8(%ecx)
+; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    movl %eax, 12(%ecx)
 ; CHECK-NEXT:    jmp _use_foo # TAILCALL
 ;
 ; CHECK-O0-LABEL: test_unrecognized:
@@ -359,9 +360,9 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    andl $1, %ecx
-; CHECK-NEXT:    movb %cl, (%eax)
+; CHECK-NEXT:    movb %al, (%ecx)
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-O0-LABEL: test_store_sptr32_trunc_i1:
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 8f97d2652bc53..f964a75cbe421 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -9,28 +9,28 @@ define void @test0(ptr %A, ptr %B) nounwind {
 ; X86-LABEL: test0:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    paddb %xmm0, %xmm1
-; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    movq %xmm1, (%eax)
+; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    paddsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusb (%ecx), %mm0
-; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq %mm0, (%eax)
+; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubb %xmm1, %xmm0
-; X86-NEXT:    movdq2q %xmm0, %mm0
 ; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movdq2q %xmm0, %mm0
 ; X86-NEXT:    psubsb (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusb (%ecx), %mm0
-; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq %mm0, (%eax)
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-NEXT:    pmullw %xmm0, %xmm1
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
@@ -143,8 +143,8 @@ define void @test1(ptr %A, ptr %B) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    paddd %xmm1, %xmm0
 ; X86-NEXT:    movq %xmm0, (%eax)
@@ -245,37 +245,37 @@ define void @test2(ptr %A, ptr %B) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    paddw %xmm0, %xmm1
-; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    movq %xmm1, (%eax)
+; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    paddsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    paddusw (%ecx), %mm0
-; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq %mm0, (%eax)
+; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psubw %xmm1, %xmm0
-; X86-NEXT:    movdq2q %xmm0, %mm0
 ; X86-NEXT:    movq %xmm0, (%eax)
+; X86-NEXT:    movdq2q %xmm0, %mm0
 ; X86-NEXT:    psubsw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    psubusw (%ecx), %mm0
-; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq %mm0, (%eax)
+; X86-NEXT:    movq2dq %mm0, %xmm0
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    pmullw %xmm0, %xmm1
-; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    movq %xmm1, (%eax)
+; X86-NEXT:    movdq2q %xmm1, %mm0
 ; X86-NEXT:    pmulhw (%ecx), %mm0
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    pmaddwd (%ecx), %mm0
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    andl 4(%ecx), %esi
 ; X86-NEXT:    movd %esi, %xmm0
 ; X86-NEXT:    andl (%ecx), %edx
@@ -548,8 +548,8 @@ define void @ti64(double %a, double %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, 0
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, 4
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll
index d8a010bacc683..353ce32ef4b0b 100644
--- a/llvm/test/CodeGen/X86/mmx-build-vector.ll
+++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll
@@ -17,11 +17,11 @@ declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
 define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X86-LABEL: build_v2i32_01:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-NEXT:    paddd %mm1, %mm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm1, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -44,9 +44,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind {
 define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X86-LABEL: build_v2i32_0z:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -67,19 +67,19 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind {
 define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X86-MMX-LABEL: build_v2i32_u1:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2i32_u1:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-SSE-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -101,11 +101,11 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X86-LABEL: build_v2i32_z1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    pxor %mm1, %mm1
 ; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-NEXT:    paddd %mm1, %mm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm1, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -128,19 +128,19 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind {
 define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind {
 ; X86-MMX-LABEL: build_v2i32_00:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2i32_00:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-SSE-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -166,7 +166,6 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind {
 define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
 ; X86-LABEL: build_v4i16_0123:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
@@ -175,6 +174,7 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 ; X86-NEXT:    punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1]
 ; X86-NEXT:    punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0]
 ; X86-NEXT:    paddd %mm2, %mm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm2, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -203,7 +203,6 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
 ; X86-LABEL: build_v4i16_01zz:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-NEXT:    punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
@@ -211,6 +210,7 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 ; X86-NEXT:    punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
 ; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-NEXT:    paddd %mm1, %mm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm1, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -238,9 +238,9 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
 ; X86-LABEL: build_v4i16_0uuz:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -265,8 +265,8 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd %eax, %mm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -290,7 +290,6 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
 ; X86-LABEL: build_v4i16_012u:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
@@ -298,6 +297,7 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 ; X86-NEXT:    punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
 ; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
 ; X86-NEXT:    paddd %mm2, %mm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm2, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -325,20 +325,20 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
 ; X86-MMX-LABEL: build_v4i16_0u00:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v4i16_0u00:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-SSE-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -366,7 +366,6 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi
 define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-LABEL: build_v8i8_01234567:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
@@ -383,6 +382,7 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; X86-NEXT:    punpcklwd %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
 ; X86-NEXT:    punpckldq %mm2, %mm3 # mm3 = mm3[0],mm2[0]
 ; X86-NEXT:    paddd %mm3, %mm3
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm3, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -423,7 +423,6 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-LABEL: build_v8i8_0u2345z7:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    pxor %mm1, %mm1
 ; X86-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
@@ -439,6 +438,7 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; X86-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
 ; X86-NEXT:    punpckldq %mm2, %mm0 # mm0 = mm0[0],mm2[0]
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -478,22 +478,22 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-LABEL: build_v8i8_0123zzzu:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-NEXT:    punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
-; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm2
-; X86-NEXT:    punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
-; X86-NEXT:    punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
-; X86-NEXT:    pxor %mm0, %mm0
+; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
+; X86-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
 ; X86-NEXT:    pxor %mm1, %mm1
+; X86-NEXT:    pxor %mm2, %mm2
+; X86-NEXT:    punpcklbw %mm2, %mm2 # mm2 = mm2[0,0,1,1,2,2,3,3]
 ; X86-NEXT:    punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3]
-; X86-NEXT:    punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
-; X86-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    paddd %mm2, %mm2
-; X86-NEXT:    movq %mm2, (%eax)
+; X86-NEXT:    punpcklwd %mm2, %mm1 # mm1 = mm1[0],mm2[0],mm1[1],mm2[1]
+; X86-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: build_v8i8_0123zzzu:
@@ -531,9 +531,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-LABEL: build_v8i8_0uuuuzzz:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -562,8 +562,8 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd %eax, %mm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -591,22 +591,22 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
 ; X86-MMX-LABEL: build_v8i8_00000000:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
 ; X86-MMX-NEXT:    punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v8i8_00000000:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-SSE-NEXT:    punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -639,23 +639,23 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
 define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind {
 ; X86-MMX-LABEL: build_v2f32_01:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-MMX-NEXT:    paddd %mm1, %mm1
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm1, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2f32_01:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm1
 ; X86-SSE-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-SSE-NEXT:    paddd %mm1, %mm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm1, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -678,22 +678,22 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind {
 define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind {
 ; X86-MMX-LABEL: build_v2f32_0z:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    pxor %mm0, %mm0
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm1
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-MMX-NEXT:    paddd %mm1, %mm1
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm1, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2f32_0z:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
 ; X86-SSE-NEXT:    pxor %mm1, %mm1
 ; X86-SSE-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -716,20 +716,20 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind {
 define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind {
 ; X86-MMX-LABEL: build_v2f32_u1:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2f32_u1:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
 ; X86-SSE-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -751,22 +751,22 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind {
 define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind {
 ; X86-MMX-LABEL: build_v2f32_z1:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    pxor %mm1, %mm1
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-MMX-NEXT:    paddd %mm1, %mm1
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm1, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2f32_z1:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
 ; X86-SSE-NEXT:    pxor %mm1, %mm1
 ; X86-SSE-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-SSE-NEXT:    paddd %mm1, %mm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm1, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
@@ -789,20 +789,20 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind {
 define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind {
 ; X86-MMX-LABEL: build_v2f32_00:
 ; X86-MMX:       # %bb.0:
-; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movd {{[0-9]+}}(%esp), %mm0
 ; X86-MMX-NEXT:    punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
 ; X86-MMX-NEXT:    paddd %mm0, %mm0
+; X86-MMX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-MMX-NEXT:    movq %mm0, (%eax)
 ; X86-MMX-NEXT:    retl
 ;
 ; X86-SSE-LABEL: build_v2f32_00:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
 ; X86-SSE-NEXT:    pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
 ; X86-SSE-NEXT:    paddd %mm0, %mm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movq %mm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index 07e56a9148cf0..db9727d6affa7 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -8,9 +8,9 @@
 define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -34,9 +34,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -60,9 +60,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f64_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -84,9 +84,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
 define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvtps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -110,9 +110,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: cvtt_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -136,9 +136,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v4f32_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -161,9 +161,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
 define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
 ; X86-LABEL: fptosi_v2f32_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 6fe3bc4973185..440955120bfab 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -10,11 +10,11 @@ define i64 @t0(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psllq %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -45,11 +45,11 @@ define i64 @t1(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psrlq %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psrlq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -80,11 +80,11 @@ define i64 @t2(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psllw %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psllw %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -115,11 +115,11 @@ define i64 @t3(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psrlw %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psrlw %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -150,11 +150,11 @@ define i64 @t4(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    pslld %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    pslld %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -185,11 +185,11 @@ define i64 @t5(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psrld %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psrld %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -220,11 +220,11 @@ define i64 @t6(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psraw %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psraw %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -255,11 +255,11 @@ define i64 @t7(ptr %a, ptr %b) nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
-; X86-NEXT:    movd (%eax), %mm1
-; X86-NEXT:    psrad %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movd (%eax), %mm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm1
+; X86-NEXT:    psrad %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ebp, %esp
@@ -297,9 +297,9 @@ define i64 @tt0(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -336,9 +336,9 @@ define i64 @tt1(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -374,9 +374,9 @@ define i64 @tt2(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddd (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -412,9 +412,9 @@ define i64 @tt3(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -450,9 +450,9 @@ define i64 @tt4(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusb (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -488,9 +488,9 @@ define i64 @tt5(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    paddusw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -526,9 +526,9 @@ define i64 @tt6(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlw (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -564,9 +564,9 @@ define i64 @tt7(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrld (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -602,9 +602,9 @@ define i64 @tt8(<1 x i64> %t, ptr %q) nounwind {
 ; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; X86-NEXT:    psrlq (%eax), %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    emms
 ; X86-NEXT:    movl (%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    emms
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -629,12 +629,12 @@ define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind {
 ; X86-LABEL: test_psrlq_by_volatile_shift_amount:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $1, (%esp)
-; X86-NEXT:    movl $255, %ecx
-; X86-NEXT:    movd %ecx, %mm0
+; X86-NEXT:    movl $255, %eax
+; X86-NEXT:    movd %eax, %mm0
 ; X86-NEXT:    movd (%esp), %mm1
 ; X86-NEXT:    psrlq %mm1, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %mm0, (%eax)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
index a3ef81daef337..1476d280c4e2d 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll
@@ -11,9 +11,9 @@ define double @mmx_zero(double, double, double, double) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movq 8(%ebp), %mm0
 ; X86-NEXT:    movq 16(%ebp), %mm5
 ; X86-NEXT:    movq %mm5, (%esp) # 8-byte Spill
+; X86-NEXT:    movq 8(%ebp), %mm0
 ; X86-NEXT:    movq %mm0, %mm3
 ; X86-NEXT:    paddd %mm5, %mm3
 ; X86-NEXT:    pxor %mm1, %mm1
diff --git a/llvm/test/CodeGen/X86/movtopush.ll b/llvm/test/CodeGen/X86/movtopush.ll
index 22929fa4b8a17..57944b282ecbe 100644
--- a/llvm/test/CodeGen/X86/movtopush.ll
+++ b/llvm/test/CodeGen/X86/movtopush.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
 ; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
 ; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
@@ -45,6 +46,60 @@ declare void @llvm.stackrestore(ptr)
 ; NOPUSH-NEXT: call
 ; NOPUSH-NEXT: addl $16, %esp
 define void @test1() {
+; NORMAL-LABEL: test1:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test1:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test1:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   call void @good(i32 1, i32 2, i32 3, i32 4)
   ret void
@@ -59,6 +114,88 @@ entry:
 ; NORMAL-NEXT: pushl   $1
 ; NORMAL-NEXT: call
 define void @test2(i32 %k) {
+; NORMAL-LABEL: test2:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl %ebp
+; NORMAL-NEXT:    movl %esp, %ebp
+; NORMAL-NEXT:    movl 8(%ebp), %eax
+; NORMAL-NEXT:    shll $2, %eax
+; NORMAL-NEXT:    calll __chkstk
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    movl %ebp, %esp
+; NORMAL-NEXT:    popl %ebp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test2:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    pushl %ebp
+; NOPUSH-NEXT:    movl %esp, %ebp
+; NOPUSH-NEXT:    movl 8(%ebp), %eax
+; NOPUSH-NEXT:    shll $2, %eax
+; NOPUSH-NEXT:    calll __chkstk
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    movl %ebp, %esp
+; NOPUSH-NEXT:    popl %ebp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:    .seh_setframe %rbp, 0
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    leaq 15(,%rax,4), %rax
+; X64-NEXT:    andq $-16, %rax
+; X64-NEXT:    callq __chkstk
+; X64-NEXT:    subq %rax, %rsp
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test2:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    pushl %ebp
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    .cfi_offset %ebp, -8
+; LINUX-NEXT:    movl %esp, %ebp
+; LINUX-NEXT:    .cfi_def_cfa_register %ebp
+; LINUX-NEXT:    subl $8, %esp
+; LINUX-NEXT:    movl %esp, %eax
+; LINUX-NEXT:    movl 8(%ebp), %ecx
+; LINUX-NEXT:    leal 15(,%ecx,4), %ecx
+; LINUX-NEXT:    andl $-16, %ecx
+; LINUX-NEXT:    subl %ecx, %eax
+; LINUX-NEXT:    movl %eax, %esp
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    movl %ebp, %esp
+; LINUX-NEXT:    popl %ebp
+; LINUX-NEXT:    .cfi_def_cfa %esp, 4
+; LINUX-NEXT:    retl
 entry:
   %a = alloca i32, i32 %k
   call void @good(i32 1, i32 2, i32 3, i32 4)
@@ -76,6 +213,60 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
 define void @test2b() optsize {
+; NORMAL-LABEL: test2b:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl $4096 # imm = 0x1000
+; NORMAL-NEXT:    pushl $3072 # imm = 0xC00
+; NORMAL-NEXT:    pushl $2048 # imm = 0x800
+; NORMAL-NEXT:    pushl $1024 # imm = 0x400
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test2b:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl $4096, {{[0-9]+}}(%esp) # imm = 0x1000
+; NOPUSH-NEXT:    movl $3072, {{[0-9]+}}(%esp) # imm = 0xC00
+; NOPUSH-NEXT:    movl $2048, {{[0-9]+}}(%esp) # imm = 0x800
+; NOPUSH-NEXT:    movl $1024, (%esp) # imm = 0x400
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test2b:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $1024, %ecx # imm = 0x400
+; X64-NEXT:    movl $2048, %edx # imm = 0x800
+; X64-NEXT:    movl $3072, %r8d # imm = 0xC00
+; X64-NEXT:    movl $4096, %r9d # imm = 0x1000
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test2b:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    pushl $4096 # imm = 0x1000
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3072 # imm = 0xC00
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2048 # imm = 0x800
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1024 # imm = 0x400
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
   ret void
@@ -91,6 +282,66 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
 define void @test3(i32 %k) optsize {
+; NORMAL-LABEL: test3:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    incl %eax
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test3:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    incl %eax
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test3:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    incl %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test3:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    incl %eax
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   %f = add i32 %k, 1
   call void @good(i32 %f, i32 2, i32 3, i32 4)
@@ -106,6 +357,59 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $12, %esp
 define void @test4() optsize {
+; NORMAL-LABEL: test4:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl $2, %eax
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _inreg
+; NORMAL-NEXT:    addl $12, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test4:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $12, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    movl $2, %eax
+; NOPUSH-NEXT:    calll _inreg
+; NOPUSH-NEXT:    addl $12, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq inreg
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test4:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $16, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 16
+; LINUX-NEXT:    movl $2, %eax
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll inreg@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   call void @inreg(i32 1, i32 inreg 2, i32 3, i32 4)
   ret void
@@ -120,6 +424,45 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: ret
 define void @test4b(ptr %f) optsize {
+; NORMAL-LABEL: test4b:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _thiscall
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test4b:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    calll _thiscall
+; NOPUSH-NEXT:    retl
+;
+; LINUX-LABEL: test4b:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll thiscall@PLT
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -16
+; LINUX-NEXT:    addl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 4
+; LINUX-NEXT:    retl
 entry:
   call x86_thiscallcc void @thiscall(ptr %f, i32 1, i32 2, i32 3, i32 4)
   ret void
@@ -135,6 +478,50 @@ declare void @f(ptr)
 @ext = external dso_local constant i8
 
 define void @test6() {
+; NORMAL-LABEL: test6:
+; NORMAL:       # %bb.0:
+; NORMAL-NEXT:    pushl %ebp
+; NORMAL-NEXT:    movl %esp, %ebp
+; NORMAL-NEXT:    pushl $_ext
+; NORMAL-NEXT:    calll _f
+; NORMAL-NEXT:    addl $4, %esp
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    movl %ebp, %esp
+; NORMAL-NEXT:    popl %ebp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test6:
+; NOPUSH:       # %bb.0:
+; NOPUSH-NEXT:    pushl %ebp
+; NOPUSH-NEXT:    movl %esp, %ebp
+; NOPUSH-NEXT:    subl $4, %esp
+; NOPUSH-NEXT:    movl $_ext, (%esp)
+; NOPUSH-NEXT:    calll _f
+; NOPUSH-NEXT:    addl $4, %esp
+; NOPUSH-NEXT:    pushl %eax
+; NOPUSH-NEXT:    movl %ebp, %esp
+; NOPUSH-NEXT:    popl %ebp
+; NOPUSH-NEXT:    retl
+;
+; LINUX-LABEL: test6:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushl %ebp
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    .cfi_offset %ebp, -8
+; LINUX-NEXT:    movl %esp, %ebp
+; LINUX-NEXT:    .cfi_def_cfa_register %ebp
+; LINUX-NEXT:    subl $8, %esp
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    pushl $ext
+; LINUX-NEXT:    calll f@PLT
+; LINUX-NEXT:    addl $16, %esp
+; LINUX-NEXT:    movl %esp, %eax
+; LINUX-NEXT:    addl $-16, %eax
+; LINUX-NEXT:    movl %eax, %esp
+; LINUX-NEXT:    movl %ebp, %esp
+; LINUX-NEXT:    popl %ebp
+; LINUX-NEXT:    .cfi_def_cfa %esp, 4
+; LINUX-NEXT:    retl
   call void @f(ptr @ext)
   br label %bb
 bb:
@@ -153,6 +540,64 @@ bb:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
 define void @test7(ptr %ptr) optsize {
+; NORMAL-LABEL: test7:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl (%eax)
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test7:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl (%eax), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test7:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl (%rcx), %r8d
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test7:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl (%eax)
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   %val = load i32, ptr %ptr
   call void @good(i32 1, i32 2, i32 %val, i32 4)
@@ -171,6 +616,66 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $16, %esp
 define void @test8(i32 %a, i32 %b) optsize {
+; NORMAL-LABEL: test8:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test8:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl %edx, %r8d
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test8:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   call void @good(i32 1, i32 %a, i32 %b, i32 4)
   ret void
@@ -199,6 +704,124 @@ entry:
 ; NORMAL-NEXT: call
 ; NORMAL-NEXT: addl $20, %esp
 define void @test9() optsize {
+; NORMAL-LABEL: test9:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl %ebp
+; NORMAL-NEXT:    movl %esp, %ebp
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    andl $-8, %esp
+; NORMAL-NEXT:    subl $24, %esp
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    movl (%esp), %eax
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; NORMAL-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; NORMAL-NEXT:    pushl %edx
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    pushl $6
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _struct
+; NORMAL-NEXT:    addl $20, %esp
+; NORMAL-NEXT:    leal -4(%ebp), %esp
+; NORMAL-NEXT:    popl %esi
+; NORMAL-NEXT:    popl %ebp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test9:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    pushl %ebp
+; NOPUSH-NEXT:    movl %esp, %ebp
+; NOPUSH-NEXT:    andl $-8, %esp
+; NOPUSH-NEXT:    subl $40, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $6, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _struct
+; NOPUSH-NEXT:    movl %ebp, %esp
+; NOPUSH-NEXT:    popl %ebp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test9:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $72, %rsp
+; X64-NEXT:    .seh_stackalloc 72
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rax, (%rcx)
+; X64-NEXT:    movl $6, %edx
+; X64-NEXT:    # kill: def $r8d killed $r8d killed $r8
+; X64-NEXT:    # kill: def $r9d killed $r9d killed $r9
+; X64-NEXT:    callq struct
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $72, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test9:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    subl $24, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 32
+; LINUX-NEXT:    .cfi_offset %esi, -8
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $4, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -4
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; LINUX-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; LINUX-NEXT:    pushl %edx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $6
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll struct@PLT
+; LINUX-NEXT:    addl $56, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -56
+; LINUX-NEXT:    popl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 4
+; LINUX-NEXT:    retl
 entry:
   %p = alloca i32, align 4
   %q = alloca i32, align 4
@@ -225,6 +848,98 @@ entry:
 ; NORMAL-NEXT: calll *16(%esp)
 ; NORMAL-NEXT: addl $24, %esp
 define void @test10() optsize {
+; NORMAL-LABEL: test10:
+; NORMAL:       # %bb.0:
+; NORMAL-NEXT:    pushl %ebp
+; NORMAL-NEXT:    pushl %ebx
+; NORMAL-NEXT:    pushl %edi
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    subl $8, %esp
+; NORMAL-NEXT:    movl $_good, {{[0-9]+}}(%esp)
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; NORMAL-NEXT:    #APP
+; NORMAL-NEXT:    nop
+; NORMAL-NEXT:    #NO_APP
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; NORMAL-NEXT:    addl $24, %esp
+; NORMAL-NEXT:    popl %esi
+; NORMAL-NEXT:    popl %edi
+; NORMAL-NEXT:    popl %ebx
+; NORMAL-NEXT:    popl %ebp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test10:
+; NOPUSH:       # %bb.0:
+; NOPUSH-NEXT:    pushl %ebp
+; NOPUSH-NEXT:    pushl %ebx
+; NOPUSH-NEXT:    pushl %edi
+; NOPUSH-NEXT:    pushl %esi
+; NOPUSH-NEXT:    subl $24, %esp
+; NOPUSH-NEXT:    movl $_good, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; NOPUSH-NEXT:    #APP
+; NOPUSH-NEXT:    nop
+; NOPUSH-NEXT:    #NO_APP
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; NOPUSH-NEXT:    addl $24, %esp
+; NOPUSH-NEXT:    popl %esi
+; NOPUSH-NEXT:    popl %edi
+; NOPUSH-NEXT:    popl %ebx
+; NOPUSH-NEXT:    popl %ebp
+; NOPUSH-NEXT:    retl
+;
+; LINUX-LABEL: test10:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushl %ebp
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    pushl %ebx
+; LINUX-NEXT:    .cfi_def_cfa_offset 12
+; LINUX-NEXT:    pushl %edi
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 20
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 32
+; LINUX-NEXT:    .cfi_offset %esi, -20
+; LINUX-NEXT:    .cfi_offset %edi, -16
+; LINUX-NEXT:    .cfi_offset %ebx, -12
+; LINUX-NEXT:    .cfi_offset %ebp, -8
+; LINUX-NEXT:    movl $good, {{[0-9]+}}(%esp)
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; LINUX-NEXT:    #APP
+; LINUX-NEXT:    nop
+; LINUX-NEXT:    #NO_APP
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll *{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    popl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    popl %edi
+; LINUX-NEXT:    .cfi_def_cfa_offset 12
+; LINUX-NEXT:    popl %ebx
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    popl %ebp
+; LINUX-NEXT:    .cfi_def_cfa_offset 4
+; LINUX-NEXT:    retl
   %stack_fptr = alloca ptr
   store ptr @good, ptr %stack_fptr
   %good_ptr = load volatile ptr, ptr %stack_fptr
@@ -246,6 +961,67 @@ define void @test10() optsize {
 ; NORMAL-NEXT: addl $16, %esp
 @the_global = external dso_local global i32
 define void @test11() optsize {
+; NORMAL-LABEL: test11:
+; NORMAL:       # %bb.0:
+; NORMAL-NEXT:    movl _the_global, %eax
+; NORMAL-NEXT:    movl $42, _the_global
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test11:
+; NOPUSH:       # %bb.0:
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl _the_global, %eax
+; NOPUSH-NEXT:    movl $42, _the_global
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test11:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    .seh_stackalloc 40
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl the_global(%rip), %ecx
+; X64-NEXT:    movl $42, the_global(%rip)
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test11:
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movl the_global, %eax
+; LINUX-NEXT:    movl $42, the_global
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
   %myload = load i32, ptr @the_global
   store i32 42, ptr @the_global
   call void @good(i32 %myload, i32 2, i32 3, i32 4)
@@ -261,6 +1037,139 @@ define void @test11() optsize {
 ; NORMAL-NEXT:  pushl  $5
 ; NORMAL-NEXT: calll _good
 define void @test12() optsize {
+; NORMAL-LABEL: test12:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    subl $8, %esp
+; NORMAL-NEXT:    movl (%esp), %eax
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _struct
+; NORMAL-NEXT:    addl $20, %esp
+; NORMAL-NEXT:    pushl $8
+; NORMAL-NEXT:    pushl $7
+; NORMAL-NEXT:    pushl $6
+; NORMAL-NEXT:    pushl $5
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    movl (%esp), %eax
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    pushl $12
+; NORMAL-NEXT:    pushl $11
+; NORMAL-NEXT:    pushl $10
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _struct
+; NORMAL-NEXT:    addl $28, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test12:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $28, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _struct
+; NOPUSH-NEXT:    movl $8, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $7, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $6, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $5, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $12, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $11, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $10, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _struct
+; NOPUSH-NEXT:    addl $28, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test12:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $72, %rsp
+; X64-NEXT:    .seh_stackalloc 72
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rax, (%rcx)
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq struct
+; X64-NEXT:    movl $5, %ecx
+; X64-NEXT:    movl $6, %edx
+; X64-NEXT:    movl $7, %r8d
+; X64-NEXT:    movl $8, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rax, (%rcx)
+; X64-NEXT:    movl $10, %edx
+; X64-NEXT:    movl $11, %r8d
+; X64-NEXT:    movl $12, %r9d
+; X64-NEXT:    callq struct
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $72, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test12:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $24, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 24
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll struct@PLT
+; LINUX-NEXT:    addl $32, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -32
+; LINUX-NEXT:    pushl $8
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $7
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $6
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $5
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $4, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -4
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    pushl $12
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $11
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $10
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll struct@PLT
+; LINUX-NEXT:    addl $44, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -44
+; LINUX-NEXT:    retl
 entry:
   %s = alloca %struct.s, align 4
   call void @struct(ptr byval(%struct.s) %s, i32 2, i32 3, i32 4)
@@ -293,6 +1202,127 @@ entry:
 ; NORMAL-NEXT: calll  _good
 ; NORMAL-NEXT: addl  $16, %esp
 define void @test12b() optsize {
+; NORMAL-LABEL: test12b:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    subl $8, %esp
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    movl (%esp), %eax
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    pushl $8
+; NORMAL-NEXT:    pushl $7
+; NORMAL-NEXT:    pushl $6
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _struct
+; NORMAL-NEXT:    addl $20, %esp
+; NORMAL-NEXT:    pushl $12
+; NORMAL-NEXT:    pushl $11
+; NORMAL-NEXT:    pushl $10
+; NORMAL-NEXT:    pushl $9
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $24, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test12b:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $28, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $8, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $7, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $6, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _struct
+; NOPUSH-NEXT:    movl $12, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $11, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $10, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $9, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $28, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test12b:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $56, %rsp
+; X64-NEXT:    .seh_stackalloc 56
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rax, (%rcx)
+; X64-NEXT:    movl $6, %edx
+; X64-NEXT:    movl $7, %r8d
+; X64-NEXT:    movl $8, %r9d
+; X64-NEXT:    callq struct
+; X64-NEXT:    movl $9, %ecx
+; X64-NEXT:    movl $10, %edx
+; X64-NEXT:    movl $11, %r8d
+; X64-NEXT:    movl $12, %r9d
+; X64-NEXT:    callq good
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $56, %rsp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test12b:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $4, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -4
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    pushl $8
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $7
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $6
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll struct@PLT
+; LINUX-NEXT:    addl $32, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -32
+; LINUX-NEXT:    pushl $12
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $11
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $10
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $9
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   %s = alloca %struct.s, align 4
   call void @good(i32 1, i32 2, i32 3, i32 4)
@@ -315,6 +1345,95 @@ entry:
 ; NORMAL-NEXT: calll _good
 ; NORMAL: movl [[P3]], %eax
 define ptr @test13(ptr inreg %ptr1, ptr inreg %ptr2, ptr inreg %ptr3) optsize {
+; NORMAL-LABEL: test13:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    movl %ecx, %esi
+; NORMAL-NEXT:    movl (%edx), %ecx
+; NORMAL-NEXT:    movl (%eax), %eax
+; NORMAL-NEXT:    leal (%eax,%ecx), %edx
+; NORMAL-NEXT:    pushl %edx
+; NORMAL-NEXT:    pushl (%esi)
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    movl %esi, %eax
+; NORMAL-NEXT:    popl %esi
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test13:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    pushl %edi
+; NOPUSH-NEXT:    pushl %esi
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl %ecx, %esi
+; NOPUSH-NEXT:    movl (%edx), %ecx
+; NOPUSH-NEXT:    movl (%eax), %eax
+; NOPUSH-NEXT:    leal (%eax,%ecx), %edx
+; NOPUSH-NEXT:    movl (%esi), %edi
+; NOPUSH-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    movl %esi, %eax
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    popl %esi
+; NOPUSH-NEXT:    popl %edi
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test13:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rsi
+; X64-NEXT:    .seh_pushreg %rsi
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    .seh_stackalloc 32
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq %r8, %rsi
+; X64-NEXT:    movl (%rcx), %ecx
+; X64-NEXT:    movl (%rdx), %edx
+; X64-NEXT:    movl (%r8), %r8d
+; X64-NEXT:    leal (%rcx,%rdx), %r9d
+; X64-NEXT:    # kill: def $ecx killed $ecx killed $rcx
+; X64-NEXT:    # kill: def $edx killed $edx killed $rdx
+; X64-NEXT:    callq good
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    popq %rsi
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test13:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    subl $8, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    .cfi_offset %esi, -8
+; LINUX-NEXT:    movl %ecx, %esi
+; LINUX-NEXT:    movl (%edx), %ecx
+; LINUX-NEXT:    movl (%eax), %eax
+; LINUX-NEXT:    leal (%eax,%ecx), %edx
+; LINUX-NEXT:    pushl %edx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl (%esi)
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $16, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -16
+; LINUX-NEXT:    movl %esi, %eax
+; LINUX-NEXT:    addl $8, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    popl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 4
+; LINUX-NEXT:    retl
 entry:
   %val1 = load i32, ptr %ptr1
   %val2 = load i32, ptr %ptr2
@@ -343,6 +1462,51 @@ entry:
 ; LINUX-NOT: add
 ; LINUX: retl
 define void @pr27140() optsize {
+; NORMAL-LABEL: pr27140:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl $4
+; NORMAL-NEXT:    pushl $3
+; NORMAL-NEXT:    pushl $2
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    calll _good
+; NORMAL-NEXT:    addl $16, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: pr27140:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $16, %esp
+; NOPUSH-NEXT:    movl $4, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $3, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $2, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl $1, (%esp)
+; NOPUSH-NEXT:    calll _good
+; NOPUSH-NEXT:    addl $16, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: pr27140:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    movl $3, %r8d
+; X64-NEXT:    movl $4, %r9d
+; X64-NEXT:    jmp good # TAILCALL
+;
+; LINUX-LABEL: pr27140:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    pushl $4
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $3
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $2
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll good@PLT
+; LINUX-NEXT:    addl $28, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -28
+; LINUX-NEXT:    retl
 entry:
   tail call void @good(i32 1, i32 2, i32 3, i32 4)
   ret void
@@ -361,6 +1525,129 @@ entry:
 declare x86_thiscallcc ptr @B_ctor(ptr returned, ptr byval(%struct.A))
 declare void @B_func(ptr sret(%struct.B), ptr, i32)
 define void @test14(ptr %a) {
+; NORMAL-LABEL: test14:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    pushl %ebp
+; NORMAL-NEXT:    movl %esp, %ebp
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    andl $-8, %esp
+; NORMAL-NEXT:    subl $24, %esp
+; NORMAL-NEXT:    movl 8(%ebp), %eax
+; NORMAL-NEXT:    movl (%eax), %edx
+; NORMAL-NEXT:    movl 4(%eax), %eax
+; NORMAL-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NORMAL-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; NORMAL-NEXT:    movl %esi, %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %edx
+; NORMAL-NEXT:    calll _B_ctor
+; NORMAL-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    pushl $1
+; NORMAL-NEXT:    pushl %esi
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _B_func
+; NORMAL-NEXT:    addl $12, %esp
+; NORMAL-NEXT:    leal -4(%ebp), %esp
+; NORMAL-NEXT:    popl %esi
+; NORMAL-NEXT:    popl %ebp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: test14:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    pushl %ebp
+; NOPUSH-NEXT:    movl %esp, %ebp
+; NOPUSH-NEXT:    pushl %esi
+; NOPUSH-NEXT:    andl $-8, %esp
+; NOPUSH-NEXT:    subl $32, %esp
+; NOPUSH-NEXT:    movl 8(%ebp), %eax
+; NOPUSH-NEXT:    movl (%eax), %ecx
+; NOPUSH-NEXT:    movl 4(%eax), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, (%esp)
+; NOPUSH-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; NOPUSH-NEXT:    movl %esi, %ecx
+; NOPUSH-NEXT:    calll _B_ctor
+; NOPUSH-NEXT:    subl $8, %esp
+; NOPUSH-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _B_func
+; NOPUSH-NEXT:    leal -4(%ebp), %esp
+; NOPUSH-NEXT:    popl %esi
+; NOPUSH-NEXT:    popl %ebp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: test14:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rsi
+; X64-NEXT:    .seh_pushreg %rsi
+; X64-NEXT:    subq $64, %rsp
+; X64-NEXT:    .seh_stackalloc 64
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq (%rcx), %rax
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    callq B_ctor
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    movl $1, %r8d
+; X64-NEXT:    callq B_func
+; X64-NEXT:    nop
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $64, %rsp
+; X64-NEXT:    popq %rsi
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_endproc
+;
+; LINUX-LABEL: test14:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    subl $24, %esp
+; LINUX-NEXT:    .cfi_def_cfa_offset 32
+; LINUX-NEXT:    .cfi_offset %esi, -8
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    movl (%eax), %edx
+; LINUX-NEXT:    movl 4(%eax), %eax
+; LINUX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; LINUX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; LINUX-NEXT:    subl $8, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 8
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; LINUX-NEXT:    movl %esi, %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %edx
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll B_ctor@PLT
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -8
+; LINUX-NEXT:    addl $4, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -4
+; LINUX-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    pushl $1
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %esi
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    .cfi_adjust_cfa_offset 4
+; LINUX-NEXT:    calll B_func@PLT
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -4
+; LINUX-NEXT:    addl $36, %esp
+; LINUX-NEXT:    .cfi_adjust_cfa_offset -36
+; LINUX-NEXT:    popl %esi
+; LINUX-NEXT:    .cfi_def_cfa_offset 4
+; LINUX-NEXT:    retl
 entry:
   %ref.tmp = alloca %struct.B, align 1
   %agg.tmp = alloca i64, align 8
@@ -399,6 +1686,66 @@ entry:
 ; NOPUSH-NEXT:  calll  _eightparams16
 ; NOPUSH-NEXT:   addl  $32, %esp
 define void @pr34863_16(i16 %x) minsize nounwind {
+; NORMAL-LABEL: pr34863_16:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    pushl $65535 # imm = 0xFFFF
+; NORMAL-NEXT:    pushl $0
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _eightparams16
+; NORMAL-NEXT:    addl $32, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: pr34863_16:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $32, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    movl $65535, {{[0-9]+}}(%esp) # imm = 0xFFFF
+; NOPUSH-NEXT:    andl $0, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _eightparams16
+; NOPUSH-NEXT:    addl $32, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: pr34863_16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $72, %rsp
+; X64-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    orw $-1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    andw $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    movl %ecx, %r8d
+; X64-NEXT:    movl %ecx, %r9d
+; X64-NEXT:    callq eightparams16
+; X64-NEXT:    addq $72, %rsp
+; X64-NEXT:    retq
+;
+; LINUX-LABEL: pr34863_16:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    pushl $65535 # imm = 0xFFFF
+; LINUX-NEXT:    pushl $0
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    calll eightparams16@PLT
+; LINUX-NEXT:    addl $44, %esp
+; LINUX-NEXT:    retl
 entry:
   tail call void @eightparams16(i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 0, i16 -1)
   ret void
@@ -431,6 +1778,66 @@ entry:
 ; NOPUSH-NEXT: calll  _eightparams
 ; NOPUSH-NEXT: addl  $32, %esp
 define void @pr34863_32(i32 %x) minsize nounwind {
+; NORMAL-LABEL: pr34863_32:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    pushl $-1
+; NORMAL-NEXT:    pushl $0
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _eightparams
+; NORMAL-NEXT:    addl $32, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: pr34863_32:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $32, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, (%esp)
+; NOPUSH-NEXT:    orl $-1, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    andl $0, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _eightparams
+; NOPUSH-NEXT:    addl $32, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: pr34863_32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $72, %rsp
+; X64-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    orl $-1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    movl %ecx, %r8d
+; X64-NEXT:    movl %ecx, %r9d
+; X64-NEXT:    callq eightparams
+; X64-NEXT:    addq $72, %rsp
+; X64-NEXT:    retq
+;
+; LINUX-LABEL: pr34863_32:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    pushl $-1
+; LINUX-NEXT:    pushl $0
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    calll eightparams@PLT
+; LINUX-NEXT:    addl $44, %esp
+; LINUX-NEXT:    retl
 entry:
   tail call void @eightparams(i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 0, i32 -1)
   ret void
@@ -481,6 +1888,93 @@ entry:
 ; NOPUSH-NEXT: calll  _eightparams64
 ; NOPUSH-NEXT: addl  $64, %esp
 define void @pr34863_64(i64 %x) minsize nounwind {
+; NORMAL-LABEL: pr34863_64:
+; NORMAL:       # %bb.0: # %entry
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NORMAL-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NORMAL-NEXT:    pushl $-1
+; NORMAL-NEXT:    pushl $-1
+; NORMAL-NEXT:    pushl $0
+; NORMAL-NEXT:    pushl $0
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    pushl %ecx
+; NORMAL-NEXT:    pushl %eax
+; NORMAL-NEXT:    calll _eightparams64
+; NORMAL-NEXT:    addl $64, %esp
+; NORMAL-NEXT:    retl
+;
+; NOPUSH-LABEL: pr34863_64:
+; NOPUSH:       # %bb.0: # %entry
+; NOPUSH-NEXT:    subl $64, %esp
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    movl %ecx, (%esp)
+; NOPUSH-NEXT:    orl $-1, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    orl $-1, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    andl $0, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    andl $0, {{[0-9]+}}(%esp)
+; NOPUSH-NEXT:    calll _eightparams64
+; NOPUSH-NEXT:    addl $64, %esp
+; NOPUSH-NEXT:    retl
+;
+; X64-LABEL: pr34863_64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $72, %rsp
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    orq $-1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    andq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    callq eightparams64
+; X64-NEXT:    addq $72, %rsp
+; X64-NEXT:    retq
+;
+; LINUX-LABEL: pr34863_64:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    subl $12, %esp
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; LINUX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; LINUX-NEXT:    pushl $-1
+; LINUX-NEXT:    pushl $-1
+; LINUX-NEXT:    pushl $0
+; LINUX-NEXT:    pushl $0
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    pushl %ecx
+; LINUX-NEXT:    pushl %eax
+; LINUX-NEXT:    calll eightparams64@PLT
+; LINUX-NEXT:    addl $76, %esp
+; LINUX-NEXT:    retl
 entry:
   tail call void @eightparams64(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 0, i64 -1)
   ret void
diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index a663f6a1dd376..48d3926da364b 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -814,8 +814,8 @@ define i16 @test_mul_spec(i16 %x) nounwind {
 ; X86-LABEL: test_mul_spec:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal 42(%eax,%eax,8), %ecx
-; X86-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NEXT:    leal 2(%eax,%eax,4), %ecx
+; X86-NEXT:    leal 42(%eax,%eax,8), %eax
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index 4129b44ed3ddc..2e8355a2fc06b 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -1336,8 +1336,8 @@ define i32 @test_mul_spec(i32 %x) nounwind {
 ; X86-LABEL: test_mul_spec:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal 42(%eax,%eax,8), %ecx
-; X86-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NEXT:    leal 2(%eax,%eax,4), %ecx
+; X86-NEXT:    leal 42(%eax,%eax,8), %eax
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -1362,8 +1362,8 @@ define i32 @test_mul_spec(i32 %x) nounwind {
 ; X86-NOOPT-LABEL: test_mul_spec:
 ; X86-NOOPT:       # %bb.0:
 ; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOOPT-NEXT:    leal 42(%eax,%eax,8), %ecx
-; X86-NOOPT-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NOOPT-NEXT:    leal 2(%eax,%eax,4), %ecx
+; X86-NOOPT-NEXT:    leal 42(%eax,%eax,8), %eax
 ; X86-NOOPT-NEXT:    imull %ecx, %eax
 ; X86-NOOPT-NEXT:    retl
 ;
@@ -1415,8 +1415,8 @@ define i32 @mul_neg_fold(i32 %a, i32 %b) {
 ; X86-LABEL: mul_neg_fold:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index 40d591f8d1be8..3ef9a2496b4f2 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -166,9 +166,9 @@ define i64 @test_mul_by_6(i64 %x) {
 define i64 @test_mul_by_7(i64 %x) {
 ; X86-LABEL: test_mul_by_7:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $7, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    leal (%edx,%ecx,8), %edx
 ; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    retl
@@ -437,9 +437,9 @@ define i64 @test_mul_by_14(i64 %x) {
 define i64 @test_mul_by_15(i64 %x) {
 ; X86-LABEL: test_mul_by_15:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $15, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
 ; X86-NEXT:    addl %ecx, %edx
@@ -815,9 +815,9 @@ define i64 @test_mul_by_24(i64 %x) {
 define i64 @test_mul_by_25(i64 %x) {
 ; X86-LABEL: test_mul_by_25:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $25, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
 ; X86-NEXT:    addl %ecx, %edx
@@ -900,9 +900,9 @@ define i64 @test_mul_by_26(i64 %x) {
 define i64 @test_mul_by_27(i64 %x) {
 ; X86-LABEL: test_mul_by_27:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $27, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
 ; X86-NEXT:    addl %ecx, %edx
@@ -1621,11 +1621,11 @@ define i64 @PR111325(i64 %a0, i1 %a1) {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmpb $1, %cl
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
@@ -1634,11 +1634,11 @@ define i64 @PR111325(i64 %a0, i1 %a1) {
 ; X86-NOOPT-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOOPT-NEXT:    andb $1, %cl
 ; X86-NOOPT-NEXT:    xorl %eax, %eax
-; X86-NOOPT-NEXT:    xorl %edx, %edx
-; X86-NOOPT-NEXT:    subl {{[0-9]+}}(%esp), %edx
 ; X86-NOOPT-NEXT:    cmpb $1, %cl
-; X86-NOOPT-NEXT:    sbbl %eax, %eax
-; X86-NOOPT-NEXT:    orl %edx, %eax
+; X86-NOOPT-NEXT:    movl $0, %ecx
+; X86-NOOPT-NEXT:    sbbl %ecx, %ecx
+; X86-NOOPT-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    orl %ecx, %eax
 ; X86-NOOPT-NEXT:    xorl %edx, %edx
 ; X86-NOOPT-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index 1f9e7a93ad0b9..6009c9cb7f42f 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -9,183 +9,154 @@
 define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-LABEL: mult:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    movl $1, %ecx
 ; X86-NEXT:    jge .LBB0_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    je .LBB0_4
 ; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB0_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    decl %ecx
 ; X86-NEXT:    cmpl $31, %ecx
-; X86-NEXT:    ja .LBB0_35
+; X86-NEXT:    ja .LBB0_38
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    jmpl *.LJTI0_0(,%ecx,4)
 ; X86-NEXT:  .LBB0_6:
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_7:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_33:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_8:
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_22:
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_10:
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_20:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:    jmp .LBB0_18
-; X86-NEXT:  .LBB0_11:
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_17:
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    jmp .LBB0_18
-; X86-NEXT:  .LBB0_13:
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_9:
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_18:
 ; X86-NEXT:    leal (%eax,%eax,2), %ecx
-; X86-NEXT:    jmp .LBB0_14
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_15:
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_12
-; X86-NEXT:  .LBB0_16:
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_31:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_17:
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_30:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:    jmp .LBB0_12
-; X86-NEXT:  .LBB0_19:
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_7:
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_21:
 ; X86-NEXT:    shll $4, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_20:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_8:
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_21:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_13:
 ; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_22:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_37:
 ; X86-NEXT:    shll $5, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_23:
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:  .LBB0_33:
 ; X86-NEXT:    leal (%eax,%eax,8), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_24:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_26:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
-; X86-NEXT:  .LBB0_14:
 ; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_25:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_10:
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_18
-; X86-NEXT:  .LBB0_26:
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_27:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:    leal (%eax,%ecx,4), %ecx
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_27:
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_19:
 ; X86-NEXT:    leal (%eax,%eax), %ecx
 ; X86-NEXT:    shll $4, %eax
-; X86-NEXT:    jmp .LBB0_28
-; X86-NEXT:  .LBB0_29:
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_11:
 ; X86-NEXT:    leal (,%eax,8), %ecx
-; X86-NEXT:    jmp .LBB0_38
-; X86-NEXT:  .LBB0_30:
+; X86-NEXT:    jmp .LBB0_12
+; X86-NEXT:  .LBB0_24:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
-; X86-NEXT:    jmp .LBB0_32
-; X86-NEXT:  .LBB0_31:
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_16:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
-; X86-NEXT:  .LBB0_32:
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_34:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_14:
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_36:
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    shll $5, %ecx
-; X86-NEXT:    jmp .LBB0_38
-; X86-NEXT:  .LBB0_35:
+; X86-NEXT:    jmp .LBB0_12
+; X86-NEXT:  .LBB0_38:
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:  .LBB0_36:
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:  .LBB0_39:
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_37:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_28:
 ; X86-NEXT:    leal (%eax,%eax,2), %ecx
 ; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:  .LBB0_38:
+; X86-NEXT:  .LBB0_12:
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_39:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_25:
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:  .LBB0_12:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_40:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_29:
 ; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    jmp .LBB0_18
-; X86-NEXT:  .LBB0_41:
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_34:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:  .LBB0_9:
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_42:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax), %ecx
 ; X86-NEXT:    shll $5, %eax
-; X86-NEXT:  .LBB0_28:
 ; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_43:
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .LBB0_32:
 ; X86-NEXT:    leal (%eax,%eax,8), %eax
-; X86-NEXT:  .LBB0_18:
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-HSW-LABEL: mult:
@@ -532,6 +503,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    subl $108, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 128
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -543,8 +516,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $1, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $1
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $2
@@ -552,9 +524,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $2, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $1
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $3
@@ -562,8 +532,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $3, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $4
@@ -571,10 +540,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $4, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $5
@@ -582,8 +548,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $5, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $3
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $6
@@ -591,9 +556,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $6, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $3
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $7
@@ -601,10 +564,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $7, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $8
@@ -612,8 +572,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $8, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $9
@@ -621,9 +580,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $9, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $10
@@ -631,9 +588,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $10, %ebx
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $11
@@ -641,10 +596,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $11, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $12
@@ -652,8 +604,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $12, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $13
@@ -661,9 +612,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $13, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $7
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $14
@@ -671,9 +620,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $14, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $7
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $15
@@ -681,9 +628,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    xorl $15, %ebp
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $16
@@ -691,10 +636,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $16, %edi
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $17
@@ -702,8 +644,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $17, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $18
@@ -711,9 +652,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $18, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $19
@@ -721,9 +660,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $19, %ebx
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $20
@@ -731,9 +668,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $20, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $21
@@ -741,9 +676,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $21, %ebx
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $22
@@ -751,10 +684,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $22, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $23
@@ -762,8 +692,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $23, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $24
@@ -771,9 +700,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $24, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $25
@@ -781,9 +708,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $25, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $26
@@ -791,9 +716,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $26, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $27
@@ -801,9 +724,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $27, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $28
@@ -811,9 +732,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    xorl $28, %ebp
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $29
@@ -821,10 +740,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $29, %edi
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $30
@@ -832,8 +748,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $30, %ebx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $31
@@ -841,10 +756,99 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $2, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $3, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $4, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $5, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $6, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $7, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $8, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $9, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $10, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $11, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $12, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $13, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $14, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $15, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $16, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $17, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $18, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $19, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    xorl $20, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $21, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    xorl $22, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $23, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $24, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $25, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $26, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $27, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    xorl $28, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    xorl $29, %ebp
+; X86-NEXT:    orl %esi, %ebp
+; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    xorl $30, %eax
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %ebp, %edi
 ; X86-NEXT:    pushl $16
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $32
@@ -854,10 +858,12 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    xorl $32, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $108, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
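mul-constant-result.ll is the one diff in this batch that looks like a real code-quality change rather than a rename: in foo, the new schedule keeps all 32 call results live at once and spills them (hence the new subl $108, %esp), where the old schedule folded each result into the running or between calls. Reconstructed from the pushes and the xorl-immediate chain, the test roughly does the following (hypothetical C, not the test's IR):

extern int mult(int, int);

/* mult(i, i/2) is expected to equal i for i = 1..32; any mismatch bit
 * survives the or-chain, and foo returns -1 on mismatch, else 0. */
int foo_sketch(void) {
  int bad = 0;
  for (int i = 1; i <= 32; ++i)
    bad |= mult(i, i / 2) ^ i;
  return bad != 0 ? -1 : 0;
}

The reassociated xor/or chain at the end is semantically identical; only register pressure across the calls changed, traded for spill traffic.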
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index bb93e34fda7c4..4476d38903468 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -9,109 +9,233 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $400, %esp # imm = 0x190
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 60(%eax), %ebp
-; X86-NEXT:    movl 56(%eax), %eax
+; X86-NEXT:    subl $416, %esp # imm = 0x1A0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 124(%esi), %edi
+; X86-NEXT:    movl (%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 112(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 116(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 120(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 108(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 104(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 100(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 76(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 84(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%esi), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%ebx), %esi
+; X86-NEXT:    movl 16(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 124(%ecx), %edx
+; X86-NEXT:    movl 120(%ecx), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 116(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 112(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 100(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 108(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 104(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 84(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 76(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 64(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl 32(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ecx), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, (%ecx)
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 4(%ebx), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 48(%ecx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 52(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ecx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
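The remaining mul-i1024.ll churn, in this hunk and everything below, is register renaming and spill-slot reshuffling inside the schoolbook 32-limb multiply; the new schedule appears to hoist most of the operand limb loads to the top and store the low result limb early. The repeating mull/addl/adcl/setb motif is one multiply-accumulate-with-carry step, roughly (an illustrative sketch, not code from the test):

#include <stdint.h>

/* One 32x32->64 partial product folded into a result limb; the high
 * half plus the setb-captured carry feeds the next limb. The sum
 * below cannot overflow 64 bits. */
static inline void mac32(uint32_t a, uint32_t b,
                         uint32_t *limb, uint32_t *carry) {
  uint64_t p = (uint64_t)a * b + *limb + *carry;
  *limb  = (uint32_t)p;
  *carry = (uint32_t)(p >> 32);
}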
@@ -152,104 +276,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 40(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 44(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 32(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 36(%ecx), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -257,169 +375,161 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 16(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 24(%eax), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -431,7 +541,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
@@ -452,104 +562,105 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -561,33 +672,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -596,105 +707,101 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 24(%ecx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 28(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 16(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 20(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
@@ -707,13 +814,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -737,241 +844,231 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 8(%ecx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 12(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 4(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
 ; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
@@ -980,154 +1077,159 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 16(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1135,33 +1237,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    adcl $0, %edx
@@ -1174,10 +1276,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -1194,7 +1296,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
@@ -1207,11 +1309,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 32(%edi), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1222,20 +1322,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 36(%eax), %esi
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
@@ -1246,130 +1343,265 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 40(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 44(%eax), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
@@ -1378,25 +1610,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -1406,117 +1638,121 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 52(%eax), %ebp
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1541,47 +1777,45 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 56(%esi), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 60(%esi), %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %eax, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
@@ -1613,226 +1847,86 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 44(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 52(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 60(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1840,30 +1934,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1886,7 +1980,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
@@ -1896,31 +1989,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1930,9 +2024,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1962,7 +2056,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -1976,131 +2070,129 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -2109,93 +2201,95 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
@@ -2204,40 +2298,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2245,151 +2340,148 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2402,18 +2494,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2434,48 +2522,43 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 64(%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 68(%eax), %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2504,48 +2587,44 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 72(%eax), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 76(%eax), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2575,105 +2654,103 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2681,147 +2758,139 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 80(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 84(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 88(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 92(%eax), %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2834,181 +2903,178 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movzbl %bl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3017,110 +3083,110 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
@@ -3131,77 +3197,76 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
@@ -3209,318 +3274,308 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 104(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 108(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 96(%esi), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 100(%esi), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 112(%esi), %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 116(%esi), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl 120(%esi), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 124(%ecx), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    imull %ebp, %eax
 ; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -3533,9 +3588,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -3545,109 +3600,105 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 88(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 92(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 80(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 84(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
@@ -3656,285 +3707,279 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 72(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl 76(%ecx), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 64(%esi), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl 68(%esi), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3942,52 +3987,55 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
@@ -3996,91 +4044,91 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4109,8 +4157,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4129,37 +4177,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 96(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 100(%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4167,222 +4210,211 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 104(%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 108(%eax), %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl 120(%ebx), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 124(%ebx), %eax
-; X86-NEXT:    imull %ecx, %eax
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl 112(%ebx), %edi
-; X86-NEXT:    movl 116(%ebx), %ebp
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4393,210 +4425,205 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    imull %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4606,7 +4633,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
@@ -4616,76 +4643,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4704,110 +4717,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, 64(%ecx)
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 68(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 16(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 24(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 72(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 28(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 76(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 32(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 80(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 36(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 84(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 40(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 88(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 44(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 92(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 48(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 96(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 52(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 100(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 56(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 104(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 60(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 64(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 68(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 72(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 76(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 80(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 84(%ecx)
-; X86-NEXT:    movl %ebp, 88(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 92(%ecx)
-; X86-NEXT:    movl %ebx, 96(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 100(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 104(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 108(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 108(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, 112(%ecx)
-; X86-NEXT:    movl %edi, 116(%ecx)
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, 116(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, 120(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 124(%ecx)
-; X86-NEXT:    addl $400, %esp # imm = 0x190
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 124(%ecx)
+; X86-NEXT:    addl $416, %esp # imm = 0x1A0
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 2d7737bfdd3c2..7da11bf0a81c8 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -11,284 +11,290 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $72, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ebx
-; X86-NEXT:    movl 8(%eax), %ebp
-; X86-NEXT:    movl (%edx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl (%ebx), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%ebx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ebx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%esi), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl 8(%esi), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 4(%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl 12(%esi), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 4(%esi), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 20(%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 20(%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 8(%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 8(%edi), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl 12(%edi), %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 28(%edx), %eax
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 28(%eax), %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl 24(%edx), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%eax), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 16(%ecx), %esi
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl 20(%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl 24(%ecx), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 28(%ecx), %ecx
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %esi
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 24(%edi), %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl 28(%edi), %eax
-; X86-NEXT:    imull %ecx, %eax
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl 16(%edi), %edi
-; X86-NEXT:    movl 20(%edx), %ebp
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, (%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, 4(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, 8(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, 12(%esi)
-; X86-NEXT:    movl %ecx, 16(%esi)
-; X86-NEXT:    movl %edi, 20(%esi)
+; X86-NEXT:    movl %ebp, 16(%esi)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 20(%esi)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, 24(%esi)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, 28(%esi)
-; X86-NEXT:    addl $72, %esp
+; X86-NEXT:    addl $80, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 2421aabdbcd99..b8a712f29e72d 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -9,124 +9,181 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $180, %esp
+; X86-NEXT:    subl $192, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 60(%esi), %edi
+; X86-NEXT:    movl (%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 24(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%esi), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 4(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 60(%ecx), %esi
+; X86-NEXT:    movl 48(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 20(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ecx), %ebp
+; X86-NEXT:    movl 24(%ecx), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %ebx
-; X86-NEXT:    movl 24(%eax), %ebp
-; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 4(%eax), %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 16(%ecx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 20(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ecx
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
@@ -147,113 +204,105 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 8(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 12(%ecx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ebx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 4(%esi), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -261,7 +310,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
@@ -283,7 +332,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -291,75 +340,69 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 20(%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -367,35 +410,31 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %ecx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -406,57 +445,62 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -488,30 +532,30 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
@@ -527,68 +571,69 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -598,163 +643,154 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 36(%eax), %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 40(%edi), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 44(%edi), %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %ebx, %eax
 ; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -766,182 +802,173 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 56(%edi), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl 60(%edi), %eax
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl 48(%edi), %esi
-; X86-NEXT:    movl 52(%edi), %edi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 40(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 44(%ecx), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 32(%esi), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 36(%esi), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
@@ -980,191 +1007,163 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 48(%esi), %edi
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 52(%esi), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl 56(%esi), %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    imull %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 60(%esi), %esi
-; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 16(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 24(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 28(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 32(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, 36(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, 40(%ecx)
-; X86-NEXT:    movl %ebx, 44(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, 44(%ecx)
+; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, 48(%ecx)
-; X86-NEXT:    movl %edi, 52(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 52(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, 56(%ecx)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, 60(%ecx)
-; X86-NEXT:    addl $180, %esp
+; X86-NEXT:    addl $192, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index e10e48f9aea08..1d8e6fd4f57cc 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -30,62 +30,61 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    movl 44(%ebp), %esi
-; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl 44(%ebp), %ebx
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    imull %edi, %eax
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    imull 28(%ebp), %ecx
 ; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl 52(%ebp), %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl %eax, 4(%esi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, 8(%esi)
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, 12(%esi)
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/musttail-inalloca.ll b/llvm/test/CodeGen/X86/musttail-inalloca.ll
index ab6159520d925..f936c59ad5694 100644
--- a/llvm/test/CodeGen/X86/musttail-inalloca.ll
+++ b/llvm/test/CodeGen/X86/musttail-inalloca.ll
@@ -20,7 +20,8 @@ define dso_local x86_thiscallcc void @methodWithVtorDisp_thunk(ptr %0, ptr inall
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl %ecx, %esi
 ; CHECK-NEXT:    subl -4(%ecx), %esi
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $_methodWithVtorDisp_thunk
 ; CHECK-NEXT:    calll ___cyg_profile_func_exit
 ; CHECK-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/musttail-struct.ll b/llvm/test/CodeGen/X86/musttail-struct.ll
index 735fd674a2ff1..44817344d4524 100644
--- a/llvm/test/CodeGen/X86/musttail-struct.ll
+++ b/llvm/test/CodeGen/X86/musttail-struct.ll
@@ -152,11 +152,11 @@ define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    sub esp, 20
 ; X32-NEXT:    .cfi_def_cfa_offset 24
-; X32-NEXT:    mov dword ptr [esp], 0
-; X32-NEXT:    mov dword ptr [esp + 4], 1
-; X32-NEXT:    mov dword ptr [esp + 8], 2
-; X32-NEXT:    mov dword ptr [esp + 12], 3
 ; X32-NEXT:    mov dword ptr [esp + 16], 4
+; X32-NEXT:    mov dword ptr [esp + 12], 3
+; X32-NEXT:    mov dword ptr [esp + 8], 2
+; X32-NEXT:    mov dword ptr [esp + 4], 1
+; X32-NEXT:    mov dword ptr [esp], 0
 ; X32-NEXT:    mov dword ptr [esp + 24], 0
 ; X32-NEXT:    mov dword ptr [esp + 28], 1
 ; X32-NEXT:    mov dword ptr [esp + 32], 2
diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll
index 65cd1edd92e31..a5d9fa8a4d9d9 100644
--- a/llvm/test/CodeGen/X86/musttail-varargs.ll
+++ b/llvm/test/CodeGen/X86/musttail-varargs.ll
@@ -246,9 +246,9 @@ define void @f_thunk(ptr %this, ...) {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $32, %esp
-; X86-NOSSE-NEXT:    movl 8(%ebp), %esi
 ; X86-NOSSE-NEXT:    leal 12(%ebp), %eax
 ; X86-NOSSE-NEXT:    movl %eax, (%esp)
+; X86-NOSSE-NEXT:    movl 8(%ebp), %esi
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    calll _get_f
 ; X86-NOSSE-NEXT:    addl $4, %esp
@@ -268,9 +268,9 @@ define void @f_thunk(ptr %this, ...) {
 ; X86-SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-SSE-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
-; X86-SSE-NEXT:    movl 8(%ebp), %esi
 ; X86-SSE-NEXT:    leal 12(%ebp), %eax
 ; X86-SSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl 8(%ebp), %esi
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    calll _get_f
 ; X86-SSE-NEXT:    addl $4, %esp
@@ -375,8 +375,8 @@ define void @h_thunk(ptr %this, ...) {
 ; X86-NEXT:    jmpl *%ecx # TAILCALL
 ; X86-NEXT:  LBB2_2: # %else
 ; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl $42, _g
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $42, _g
 ; X86-NEXT:    jmpl *%ecx # TAILCALL
   %cond = load i1, ptr %this
   br i1 %cond, label %then, label %else
diff --git a/llvm/test/CodeGen/X86/musttail.ll b/llvm/test/CodeGen/X86/musttail.ll
index 9e02585a3ffdc..30b26b069bdec 100644
--- a/llvm/test/CodeGen/X86/musttail.ll
+++ b/llvm/test/CodeGen/X86/musttail.ll
@@ -1,11 +1,10 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=i686-- < %s | FileCheck %s
 ; RUN: llc -mtriple=i686-- -O0 < %s | FileCheck %s
 ; RUN: llc -mtriple=i686-- -disable-tail-calls < %s | FileCheck %s
 
 declare void @t1_callee(ptr)
 define void @t1(ptr %a) {
-; CHECK-LABEL: t1:
-; CHECK: jmp {{_?}}t1_callee
   musttail call void @t1_callee(ptr %a)
   ret void
 }
@@ -13,7 +12,8 @@ define void @t1(ptr %a) {
 declare ptr @t2_callee()
 define ptr @t2() {
 ; CHECK-LABEL: t2:
-; CHECK: jmp {{_?}}t2_callee
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    jmp t2_callee@PLT # TAILCALL
   %v = musttail call ptr @t2_callee()
   ret ptr %v
 }
@@ -21,14 +21,6 @@ define ptr @t2() {
 ; Complex frame layout: stack realignment with dynamic alloca.
 define void @t3(i32 %n) alignstack(32) nounwind {
 entry:
-; CHECK: t3:
-; CHECK: pushl %ebp
-; CHECK: pushl %esi
-; CHECK: andl $-32, %esp
-; CHECK: movl %esp, %esi
-; CHECK: popl %esi
-; CHECK: popl %ebp
-; CHECK-NEXT: jmp {{_?}}t3_callee
   %a = alloca i8, i32 %n
   call void @capture(ptr %a)
   musttail call void @t3_callee(i32 %n) nounwind
@@ -41,12 +33,6 @@ declare void @t3_callee(i32)
 ; Test that we actually copy in and out stack arguments that aren't forwarded
 ; without modification.
 define i32 @t4(ptr %fn, i32 %n, i32 %r) {
-; CHECK-LABEL: t4:
-; CHECK: incl %[[r:.*]]
-; CHECK: decl %[[n:.*]]
-; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp)
-; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp)
-; CHECK: jmpl *%{{.*}}
 
 entry:
   %r1 = add i32 %r, 1
@@ -57,24 +43,10 @@ entry:
 
 ; Combine the complex stack frame with the parameter modification.
 define i32 @t5(ptr %fn, i32 %n, i32 %r) alignstack(32) {
-; CHECK-LABEL: t5:
-; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
-; CHECK: pushl %esi
 ; 	Align the stack.
-; CHECK: andl $-32, %esp
-; CHECK: movl %esp, %esi
 ; 	Modify the args.
-; CHECK: incl %[[r:.*]]
-; CHECK: decl %[[n:.*]]
 ; 	Store them through ebp, since that's the only stable arg pointer.
-; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%ebp)
-; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%ebp)
 ; 	Epilogue.
-; CHECK: leal {{[-0-9]+}}(%ebp), %esp
-; CHECK: popl %esi
-; CHECK: popl %ebp
-; CHECK: jmpl *%{{.*}}
 
 entry:
   %a = alloca i8, i32 %n
diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll
index 050610426e6bd..f3d5989ad6818 100644
--- a/llvm/test/CodeGen/X86/neg-abs.ll
+++ b/llvm/test/CodeGen/X86/neg-abs.ll
@@ -111,28 +111,28 @@ define i128 @neg_abs_i128(i128 %x) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    subl %edi, %ebx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %eax, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -230,22 +230,20 @@ define i32 @sub_abs_i32(i32 %x, i32 %y) nounwind {
 define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: sub_abs_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    subl %edi, %eax
+; X86-NEXT:    subl %esi, %eax
 ; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sub_abs_i64:
@@ -265,39 +263,42 @@ define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl 28(%ebp), %esi
-; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    subl %edx, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl 40(%ebp), %edx
-; X86-NEXT:    subl %edi, %edx
-; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    subl %edi, %ebx
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
 ; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll
index 18ded50b45037..0ca1e03fbd98b 100644
--- a/llvm/test/CodeGen/X86/neg_fp.ll
+++ b/llvm/test/CodeGen/X86/neg_fp.ll
@@ -32,13 +32,12 @@ define double @negation_propagation(ptr %arg, double %arg1, double %arg2) nounwi
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    divsd 12(%ebp), %xmm0
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movapd %xmm0, %xmm1
 ; CHECK-NEXT:    mulsd %xmm0, %xmm1
-; CHECK-NEXT:    movapd %xmm0, %xmm2
-; CHECK-NEXT:    mulsd %xmm0, %xmm2
-; CHECK-NEXT:    mulsd %xmm0, %xmm2
-; CHECK-NEXT:    subsd %xmm2, %xmm1
-; CHECK-NEXT:    movsd %xmm1, (%esp)
+; CHECK-NEXT:    mulsd %xmm0, %xmm1
+; CHECK-NEXT:    mulsd 20(%ebp), %xmm0
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    movsd %xmm0, (%esp)
 ; CHECK-NEXT:    fldl (%esp)
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
@@ -63,13 +62,13 @@ define float @fdiv_extra_use_changes_cost(float %a0, float %a1, float %a2) nounw
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    subss {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm2
-; CHECK-NEXT:    mulss %xmm0, %xmm2
-; CHECK-NEXT:    subss %xmm1, %xmm0
-; CHECK-NEXT:    divss %xmm2, %xmm0
-; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    movaps %xmm0, %xmm2
+; CHECK-NEXT:    mulss %xmm1, %xmm2
+; CHECK-NEXT:    subss %xmm0, %xmm1
+; CHECK-NEXT:    divss %xmm2, %xmm1
+; CHECK-NEXT:    movss %xmm1, (%esp)
 ; CHECK-NEXT:    flds (%esp)
 ; CHECK-NEXT:    popl %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/negate-add-zero.ll b/llvm/test/CodeGen/X86/negate-add-zero.ll
index 4884832a91a52..42b2102abf47c 100644
--- a/llvm/test/CodeGen/X86/negate-add-zero.ll
+++ b/llvm/test/CodeGen/X86/negate-add-zero.ll
@@ -828,45 +828,39 @@ define linkonce void @_ZN21HNodeTranslateRotate36setVelERK9CDSVectorIdLi1EN3CDS1
 ; CHECK-LABEL: _ZN21HNodeTranslateRotate36setVelERK9CDSVectorIdLi1EN3CDS12DefaultAllocEE:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    fldl 0
-; CHECK-NEXT:    fldl 3184(%ecx)
-; CHECK-NEXT:    fld %st(1)
-; CHECK-NEXT:    fmull 3176(%ecx)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fmull 3176(%eax)
+; CHECK-NEXT:    fldl 3184(%eax)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fmull -8
 ; CHECK-NEXT:    fldz
-; CHECK-NEXT:    fld %st(1)
-; CHECK-NEXT:    fadd %st(1), %st
-; CHECK-NEXT:    fld %st(3)
+; CHECK-NEXT:    fsub %st, %st(1)
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fsub %st(3), %st
+; CHECK-NEXT:    fld %st(4)
 ; CHECK-NEXT:    fmul %st(5), %st
-; CHECK-NEXT:    fadd %st(2), %st
-; CHECK-NEXT:    fxch %st(5)
-; CHECK-NEXT:    fmul %st, %st(0)
-; CHECK-NEXT:    fadd %st, %st(5)
-; CHECK-NEXT:    fsubr %st, %st(5)
+; CHECK-NEXT:    fadd %st, %st(1)
+; CHECK-NEXT:    fadd %st, %st(1)
+; CHECK-NEXT:    fxch %st(3)
+; CHECK-NEXT:    fmulp %st, %st(5)
 ; CHECK-NEXT:    fxch %st(4)
-; CHECK-NEXT:    fmull -8
-; CHECK-NEXT:    fxch %st(5)
-; CHECK-NEXT:    fstl 8
-; CHECK-NEXT:    fxch %st(2)
-; CHECK-NEXT:    fsubp %st, %st(5)
-; CHECK-NEXT:    fxch %st(4)
-; CHECK-NEXT:    fsubp %st, %st(2)
-; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fadd %st(1), %st
 ; CHECK-NEXT:    fadd %st(2), %st
-; CHECK-NEXT:    faddp %st, %st(2)
+; CHECK-NEXT:    fsubp %st, %st(2)
 ; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fstl 8
+; CHECK-NEXT:    fxch %st(3)
 ; CHECK-NEXT:    fstl 16
-; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:    fxch %st(3)
 ; CHECK-NEXT:    fadd %st, %st(0)
-; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    fstpl 2064(%eax)
+; CHECK-NEXT:    faddp %st, %st(1)
 ; CHECK-NEXT:    fadd %st, %st(0)
-; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:    fstpl 2056(%eax)
 ; CHECK-NEXT:    fadd %st, %st(0)
-; CHECK-NEXT:    fxch %st(1)
-; CHECK-NEXT:    fstpl 2056(%ecx)
-; CHECK-NEXT:    fxch %st(1)
-; CHECK-NEXT:    fstpl 2064(%ecx)
-; CHECK-NEXT:    fstpl 2072(%ecx)
+; CHECK-NEXT:    fstpl 2072(%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, 0
 ; CHECK-NEXT:    movl $4, 4
 ; CHECK-NEXT:    movl $3, 8
diff --git a/llvm/test/CodeGen/X86/no-ret-in-x87-reg.ll b/llvm/test/CodeGen/X86/no-ret-in-x87-reg.ll
index 19554e652e4cc..08e8605011ebb 100644
--- a/llvm/test/CodeGen/X86/no-ret-in-x87-reg.ll
+++ b/llvm/test/CodeGen/X86/no-ret-in-x87-reg.ll
@@ -64,8 +64,10 @@ define float @f4(float %a, float %b) nounwind {
 ;
 ; NOSSE-NOX87-LABEL: f4:
 ; NOSSE-NOX87:       # %bb.0: # %entry
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NOX87-NEXT:    pushl %eax
+; NOSSE-NOX87-NEXT:    pushl %ecx
 ; NOSSE-NOX87-NEXT:    calll __addsf3
 ; NOSSE-NOX87-NEXT:    addl $8, %esp
 ; NOSSE-NOX87-NEXT:    retl
@@ -90,12 +92,18 @@ define double @f5(double %a, double %b) nounwind {
 ;
 ; NOSSE-NOX87-LABEL: f5:
 ; NOSSE-NOX87:       # %bb.0: # %entry
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOSSE-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOSSE-NOX87-NEXT:    pushl %esi
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOSSE-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NOX87-NEXT:    pushl %ecx
+; NOSSE-NOX87-NEXT:    pushl %eax
+; NOSSE-NOX87-NEXT:    pushl %esi
+; NOSSE-NOX87-NEXT:    pushl %edx
 ; NOSSE-NOX87-NEXT:    calll __adddf3
 ; NOSSE-NOX87-NEXT:    addl $16, %esp
+; NOSSE-NOX87-NEXT:    popl %esi
 ; NOSSE-NOX87-NEXT:    retl
 ;
 ; SSE-NOX87-LABEL: f5:
@@ -127,17 +135,27 @@ define x86_fp80 @f6(x86_fp80 %a, x86_fp80 %b) nounwind {
 ;
 ; NOX87-LABEL: f6:
 ; NOX87:       # %bb.0: # %entry
+; NOX87-NEXT:    pushl %ebx
+; NOX87-NEXT:    pushl %edi
+; NOX87-NEXT:    pushl %esi
 ; NOX87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; NOX87-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; NOX87-NEXT:    pushl %ecx
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOX87-NEXT:    movzwl {{[0-9]+}}(%esp), %esi
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; NOX87-NEXT:    pushl %eax
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOX87-NEXT:    pushl %edx
+; NOX87-NEXT:    pushl %ecx
+; NOX87-NEXT:    pushl %esi
+; NOX87-NEXT:    pushl %ebx
+; NOX87-NEXT:    pushl %edi
 ; NOX87-NEXT:    calll __addxf3
 ; NOX87-NEXT:    addl $24, %esp
 ; NOX87-NEXT:    movzwl %cx, %ecx
+; NOX87-NEXT:    popl %esi
+; NOX87-NEXT:    popl %edi
+; NOX87-NEXT:    popl %ebx
 ; NOX87-NEXT:    retl
 entry:
   %0 = fadd x86_fp80 %a, %b
@@ -149,8 +167,8 @@ define {float, float, float} @f7(float %a, float %b) nounwind {
 ; X87:       # %bb.0: # %entry
 ; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X87-NEXT:    flds {{[0-9]+}}(%esp)
-; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fstps 8(%eax)
+; X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X87-NEXT:    fsts 4(%eax)
 ; X87-NEXT:    fstps (%eax)
 ; X87-NEXT:    retl $4
@@ -185,8 +203,10 @@ define x86_fp80 @f8(i64 %a) nounwind {
 ;
 ; NOX87-LABEL: f8:
 ; NOX87:       # %bb.0: # %entry
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOX87-NEXT:    pushl %ecx
+; NOX87-NEXT:    pushl %eax
 ; NOX87-NEXT:    calll __floatdixf
 ; NOX87-NEXT:    addl $8, %esp
 ; NOX87-NEXT:    movzwl %cx, %ecx
@@ -215,9 +235,11 @@ define i32 @f9(x86_fp80 %a) nounwind {
 ; NOX87-LABEL: f9:
 ; NOX87:       # %bb.0: # %entry
 ; NOX87-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; NOX87-NEXT:    pushl %eax
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; NOX87-NEXT:    pushl %edx
+; NOX87-NEXT:    pushl %ecx
 ; NOX87-NEXT:    calll __fixxfsi
 ; NOX87-NEXT:    addl $12, %esp
 ; NOX87-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/nomovtopush.ll b/llvm/test/CodeGen/X86/nomovtopush.ll
index ac21ae5d66cb4..d9a92b8e67f82 100644
--- a/llvm/test/CodeGen/X86/nomovtopush.ll
+++ b/llvm/test/CodeGen/X86/nomovtopush.ll
@@ -20,14 +20,14 @@ define dso_local i32 @test() local_unnamed_addr {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl $16396, %eax # imm = 0x400C
 ; CHECK-NEXT:    calll __chkstk
-; CHECK-NEXT:    movl _g_d, %eax
-; CHECK-NEXT:    movl _g_c, %ecx
-; CHECK-NEXT:    movl _g_b, %edx
-; CHECK-NEXT:    movl _g_a, %esi
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl _g_a, %eax
+; CHECK-NEXT:    movl _g_b, %ecx
+; CHECK-NEXT:    movl _g_c, %edx
+; CHECK-NEXT:    movl _g_d, %esi
+; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %esi, (%esp)
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    calll _bar
 ; CHECK-NEXT:    movl $4099, %ecx # imm = 0x1003
 ; CHECK-NEXT:    movl %esp, %edi
diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll
index 3b6ffacb0b230..a8d99d4f88877 100644
--- a/llvm/test/CodeGen/X86/nontemporal.ll
+++ b/llvm/test/CodeGen/X86/nontemporal.ll
@@ -12,36 +12,36 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
 ; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; X86-SSE-NEXT:    movl 12(%ebp), %ecx
-; X86-SSE-NEXT:    movdqa 56(%ebp), %xmm4
-; X86-SSE-NEXT:    movdqa 40(%ebp), %xmm5
-; X86-SSE-NEXT:    movdqa 24(%ebp), %xmm6
-; X86-SSE-NEXT:    movl 8(%ebp), %esi
-; X86-SSE-NEXT:    movl 80(%ebp), %edx
-; X86-SSE-NEXT:    movl (%edx), %eax
+; X86-SSE-NEXT:    movl 80(%ebp), %ecx
+; X86-SSE-NEXT:    movl (%ecx), %eax
 ; X86-SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movntps %xmm0, (%esi)
+; X86-SSE-NEXT:    movl 8(%ebp), %edx
+; X86-SSE-NEXT:    movntps %xmm0, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
 ; X86-SSE-NEXT:    paddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntdq %xmm2, (%esi)
+; X86-SSE-NEXT:    movntdq %xmm2, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
 ; X86-SSE-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntpd %xmm1, (%esi)
-; X86-SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntdq %xmm6, (%esi)
-; X86-SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntdq %xmm5, (%esi)
-; X86-SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntdq %xmm4, (%esi)
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movntil %ecx, (%esi)
-; X86-SSE-NEXT:    addl (%edx), %eax
-; X86-SSE-NEXT:    movsd %xmm3, (%esi)
-; X86-SSE-NEXT:    addl (%edx), %eax
+; X86-SSE-NEXT:    movntpd %xmm1, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
+; X86-SSE-NEXT:    movdqa 24(%ebp), %xmm0
+; X86-SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movntdq %xmm0, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
+; X86-SSE-NEXT:    movdqa 40(%ebp), %xmm0
+; X86-SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movntdq %xmm0, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
+; X86-SSE-NEXT:    movdqa 56(%ebp), %xmm0
+; X86-SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE-NEXT:    movntdq %xmm0, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
+; X86-SSE-NEXT:    movl 12(%ebp), %esi
+; X86-SSE-NEXT:    movntil %esi, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movsd %xmm0, (%edx)
+; X86-SSE-NEXT:    addl (%ecx), %eax
 ; X86-SSE-NEXT:    leal -4(%ebp), %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %ebp
@@ -54,36 +54,36 @@ define i32 @f(<4 x float> %A, ptr %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
 ; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    andl $-16, %esp
 ; X86-AVX-NEXT:    subl $16, %esp
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
-; X86-AVX-NEXT:    movl 12(%ebp), %ecx
-; X86-AVX-NEXT:    vmovdqa 56(%ebp), %xmm4
-; X86-AVX-NEXT:    vmovdqa 40(%ebp), %xmm5
-; X86-AVX-NEXT:    vmovdqa 24(%ebp), %xmm6
-; X86-AVX-NEXT:    movl 8(%ebp), %esi
-; X86-AVX-NEXT:    movl 80(%ebp), %edx
-; X86-AVX-NEXT:    movl (%edx), %eax
+; X86-AVX-NEXT:    movl 80(%ebp), %ecx
+; X86-AVX-NEXT:    movl (%ecx), %eax
 ; X86-AVX-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovntps %xmm0, (%esi)
+; X86-AVX-NEXT:    movl 8(%ebp), %edx
+; X86-AVX-NEXT:    vmovntps %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
 ; X86-AVX-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm0
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
 ; X86-AVX-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm0
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovntpd %xmm0, (%esi)
-; X86-AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm6, %xmm0
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X86-AVX-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5, %xmm0
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X86-AVX-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4, %xmm0
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    movntil %ecx, (%esi)
-; X86-AVX-NEXT:    addl (%edx), %eax
-; X86-AVX-NEXT:    vmovsd %xmm3, (%esi)
-; X86-AVX-NEXT:    addl (%edx), %eax
+; X86-AVX-NEXT:    vmovntpd %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
+; X86-AVX-NEXT:    vmovdqa 24(%ebp), %xmm0
+; X86-AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
+; X86-AVX-NEXT:    vmovdqa 40(%ebp), %xmm0
+; X86-AVX-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
+; X86-AVX-NEXT:    vmovdqa 56(%ebp), %xmm0
+; X86-AVX-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
+; X86-AVX-NEXT:    movl 12(%ebp), %esi
+; X86-AVX-NEXT:    movntil %esi, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovsd %xmm0, (%edx)
+; X86-AVX-NEXT:    addl (%ecx), %eax
 ; X86-AVX-NEXT:    leal -4(%ebp), %esp
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    popl %ebp
@@ -180,9 +180,9 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
 ; X86-LABEL: test_mmx:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movq (%eax), %mm0
 ; X86-NEXT:    psrlq $3, %mm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movntq %mm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/nosse-vector.ll b/llvm/test/CodeGen/X86/nosse-vector.ll
index 9807d1b09d8ef..beb34bed554ce 100644
--- a/llvm/test/CodeGen/X86/nosse-vector.ll
+++ b/llvm/test/CodeGen/X86/nosse-vector.ll
@@ -6,15 +6,14 @@ define void @fadd_2f64_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 ; X32-LABEL: fadd_2f64_mem:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    fldl 8(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    faddl 8(%ecx)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    fldl 8(%edx)
-; X32-NEXT:    fldl (%edx)
+; X32-NEXT:    fstpl 8(%edx)
+; X32-NEXT:    fldl (%eax)
 ; X32-NEXT:    faddl (%ecx)
-; X32-NEXT:    fxch %st(1)
-; X32-NEXT:    faddl 8(%ecx)
-; X32-NEXT:    fstpl 8(%eax)
-; X32-NEXT:    fstpl (%eax)
+; X32-NEXT:    fstpl (%edx)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: fadd_2f64_mem:
@@ -38,24 +37,20 @@ define void @fadd_4f32_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 ; X32-LABEL: fadd_4f32_mem:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    flds 12(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    fadds 12(%ecx)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    flds 12(%edx)
-; X32-NEXT:    flds 8(%edx)
-; X32-NEXT:    flds 4(%edx)
-; X32-NEXT:    flds (%edx)
-; X32-NEXT:    fadds (%ecx)
-; X32-NEXT:    fxch %st(1)
-; X32-NEXT:    fadds 4(%ecx)
-; X32-NEXT:    fxch %st(2)
+; X32-NEXT:    fstps 12(%edx)
+; X32-NEXT:    flds 8(%eax)
 ; X32-NEXT:    fadds 8(%ecx)
-; X32-NEXT:    fxch %st(3)
-; X32-NEXT:    fadds 12(%ecx)
-; X32-NEXT:    fstps 12(%eax)
-; X32-NEXT:    fxch %st(2)
-; X32-NEXT:    fstps 8(%eax)
-; X32-NEXT:    fstps 4(%eax)
-; X32-NEXT:    fstps (%eax)
+; X32-NEXT:    fstps 8(%edx)
+; X32-NEXT:    flds 4(%eax)
+; X32-NEXT:    fadds 4(%ecx)
+; X32-NEXT:    fstps 4(%edx)
+; X32-NEXT:    flds (%eax)
+; X32-NEXT:    fadds (%ecx)
+; X32-NEXT:    fstps (%edx)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: fadd_4f32_mem:
@@ -88,24 +83,20 @@ define void @fdiv_4f32_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 ; X32-LABEL: fdiv_4f32_mem:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    flds 12(%eax)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    fdivs 12(%ecx)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    flds 12(%edx)
-; X32-NEXT:    flds 8(%edx)
-; X32-NEXT:    flds 4(%edx)
-; X32-NEXT:    flds (%edx)
-; X32-NEXT:    fdivs (%ecx)
-; X32-NEXT:    fxch %st(1)
-; X32-NEXT:    fdivs 4(%ecx)
-; X32-NEXT:    fxch %st(2)
+; X32-NEXT:    fstps 12(%edx)
+; X32-NEXT:    flds 8(%eax)
 ; X32-NEXT:    fdivs 8(%ecx)
-; X32-NEXT:    fxch %st(3)
-; X32-NEXT:    fdivs 12(%ecx)
-; X32-NEXT:    fstps 12(%eax)
-; X32-NEXT:    fxch %st(2)
-; X32-NEXT:    fstps 8(%eax)
-; X32-NEXT:    fstps 4(%eax)
-; X32-NEXT:    fstps (%eax)
+; X32-NEXT:    fstps 8(%edx)
+; X32-NEXT:    flds 4(%eax)
+; X32-NEXT:    fdivs 4(%ecx)
+; X32-NEXT:    fstps 4(%edx)
+; X32-NEXT:    flds (%eax)
+; X32-NEXT:    fdivs (%ecx)
+; X32-NEXT:    fstps (%edx)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: fdiv_4f32_mem:
@@ -144,36 +135,34 @@ define void @sitofp_4i64_4f32_mem(ptr %p0, ptr %p1) nounwind {
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    andl $-8, %esp
 ; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 8(%ebp), %edx
-; X32-NEXT:    movl 24(%edx), %eax
+; X32-NEXT:    movl 8(%ebp), %esi
+; X32-NEXT:    movl 28(%esi), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 24(%esi), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl (%esi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 28(%edx), %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 16(%edx), %esi
-; X32-NEXT:    movl 20(%edx), %edi
-; X32-NEXT:    movl 8(%edx), %ebx
-; X32-NEXT:    movl 12(%edx), %ecx
-; X32-NEXT:    movl (%edx), %eax
-; X32-NEXT:    movl 4(%edx), %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 4(%esi), %ecx
+; X32-NEXT:    movl 8(%esi), %edx
+; X32-NEXT:    movl 12(%esi), %edi
+; X32-NEXT:    movl 16(%esi), %ebx
+; X32-NEXT:    movl 20(%esi), %eax
+; X32-NEXT:    fildll {{[0-9]+}}(%esp)
+; X32-NEXT:    movl 12(%ebp), %esi
+; X32-NEXT:    fstps 12(%esi)
 ; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT:    fildll {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps 8(%esi)
 ; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    fildll {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps 4(%esi)
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl 12(%ebp), %eax
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fstps 12(%eax)
-; X32-NEXT:    fstps 8(%eax)
-; X32-NEXT:    fstps 4(%eax)
-; X32-NEXT:    fstps (%eax)
+; X32-NEXT:    fstps (%esi)
 ; X32-NEXT:    leal -12(%ebp), %esp
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
@@ -209,30 +198,26 @@ define void @sitofp_4i64_4f32_mem(ptr %p0, ptr %p1) nounwind {
 define void @sitofp_4i32_4f32_mem(ptr %p0, ptr %p1) nounwind {
 ; X32-LABEL: sitofp_4i32_4f32_mem:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %edi
-; X32-NEXT:    pushl %esi
 ; X32-NEXT:    subl $16, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 12(%ecx), %edx
-; X32-NEXT:    movl 8(%ecx), %esi
-; X32-NEXT:    movl (%ecx), %edi
-; X32-NEXT:    movl 4(%ecx), %ecx
-; X32-NEXT:    movl %edi, (%esp)
+; X32-NEXT:    movl 12(%eax), %ecx
 ; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT:    fildl (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    fildl {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps 12(%ecx)
+; X32-NEXT:    movl 8(%eax), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildl {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps 8(%ecx)
+; X32-NEXT:    movl (%eax), %edx
+; X32-NEXT:    movl 4(%eax), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fildl {{[0-9]+}}(%esp)
-; X32-NEXT:    fstps 12(%eax)
-; X32-NEXT:    fstps 8(%eax)
-; X32-NEXT:    fstps 4(%eax)
-; X32-NEXT:    fstps (%eax)
+; X32-NEXT:    fstps 4(%ecx)
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    fildl (%esp)
+; X32-NEXT:    fstps (%ecx)
 ; X32-NEXT:    addl $16, %esp
-; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_4i32_4f32_mem:
@@ -263,27 +248,25 @@ define void @sitofp_4i32_4f32_mem(ptr %p0, ptr %p1) nounwind {
 define void @add_2i64_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 ; X32-LABEL: add_2i64_mem:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl 12(%edx), %esi
-; X32-NEXT:    movl 8(%edx), %edi
-; X32-NEXT:    movl (%edx), %ebx
-; X32-NEXT:    movl 4(%edx), %edx
-; X32-NEXT:    addl (%ecx), %ebx
-; X32-NEXT:    adcl 4(%ecx), %edx
-; X32-NEXT:    addl 8(%ecx), %edi
-; X32-NEXT:    adcl 12(%ecx), %esi
-; X32-NEXT:    movl %edi, 8(%eax)
-; X32-NEXT:    movl %ebx, (%eax)
-; X32-NEXT:    movl %esi, 12(%eax)
-; X32-NEXT:    movl %edx, 4(%eax)
+; X32-NEXT:    movl 8(%ecx), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl 8(%esi), %edx
+; X32-NEXT:    movl %edx, 8(%eax)
+; X32-NEXT:    movl 12(%ecx), %edx
+; X32-NEXT:    adcl 12(%esi), %edx
+; X32-NEXT:    movl (%ecx), %edi
+; X32-NEXT:    movl 4(%ecx), %ecx
+; X32-NEXT:    addl (%esi), %edi
+; X32-NEXT:    movl %edi, (%eax)
+; X32-NEXT:    movl %edx, 12(%eax)
+; X32-NEXT:    adcl 4(%esi), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: add_2i64_mem:
@@ -305,27 +288,23 @@ define void @add_2i64_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 define void @add_4i32_mem(ptr %p0, ptr %p1, ptr %p2) nounwind {
 ; X32-LABEL: add_4i32_mem:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl 12(%edx), %esi
-; X32-NEXT:    movl 8(%edx), %edi
-; X32-NEXT:    movl (%edx), %ebx
-; X32-NEXT:    movl 4(%edx), %edx
-; X32-NEXT:    addl (%ecx), %ebx
-; X32-NEXT:    addl 4(%ecx), %edx
-; X32-NEXT:    addl 8(%ecx), %edi
-; X32-NEXT:    addl 12(%ecx), %esi
-; X32-NEXT:    movl %esi, 12(%eax)
-; X32-NEXT:    movl %edi, 8(%eax)
-; X32-NEXT:    movl %edx, 4(%eax)
-; X32-NEXT:    movl %ebx, (%eax)
+; X32-NEXT:    movl 12(%ecx), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    addl 12(%eax), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %edx, 12(%esi)
+; X32-NEXT:    movl 8(%ecx), %edx
+; X32-NEXT:    addl 8(%eax), %edx
+; X32-NEXT:    movl %edx, 8(%esi)
+; X32-NEXT:    movl (%ecx), %edx
+; X32-NEXT:    movl 4(%ecx), %ecx
+; X32-NEXT:    addl 4(%eax), %ecx
+; X32-NEXT:    movl %ecx, 4(%esi)
+; X32-NEXT:    addl (%eax), %edx
+; X32-NEXT:    movl %edx, (%esi)
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
-; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: add_4i32_mem:
diff --git a/llvm/test/CodeGen/X86/not-of-dec.ll b/llvm/test/CodeGen/X86/not-of-dec.ll
index 29461619ac805..b4a50df9c4b9c 100644
--- a/llvm/test/CodeGen/X86/not-of-dec.ll
+++ b/llvm/test/CodeGen/X86/not-of-dec.ll
@@ -55,9 +55,9 @@ define i64 @t1_64(i64 %alignment) nounwind {
 define i32 @t2_extrause(i32 %alignment, ptr %mask_storage) nounwind {
 ; X86-LABEL: t2_extrause:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    decl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/not-shift.ll b/llvm/test/CodeGen/X86/not-shift.ll
index 840210cddbba1..4dae056b8e94e 100644
--- a/llvm/test/CodeGen/X86/not-shift.ll
+++ b/llvm/test/CodeGen/X86/not-shift.ll
@@ -12,10 +12,10 @@ define i64 @sub63_shiftl64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub63_shiftl64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movb $63, %cl
 ; X86-NOBMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -30,10 +30,10 @@ define i64 @sub63_shiftl64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: sub63_shiftl64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movb $63, %cl
 ; X86-BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -67,10 +67,10 @@ define i64 @xor63_shiftr64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor63_shiftr64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    xorb $63, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movl %esi, %edx
 ; X86-NOBMI2-NEXT:    shrl %cl, %edx
 ; X86-NOBMI2-NEXT:    shrdl %cl, %esi, %eax
@@ -85,10 +85,10 @@ define i64 @xor63_shiftr64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: xor63_shiftr64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorb $63, %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -122,10 +122,10 @@ define i64 @sub127_shiftl64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub127_shiftl64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    xorb $127, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -140,10 +140,10 @@ define i64 @sub127_shiftl64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: sub127_shiftl64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorb $127, %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -177,10 +177,10 @@ define i64 @xor127_shiftr64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor127_shiftr64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    xorb $127, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movl %esi, %edx
 ; X86-NOBMI2-NEXT:    shrl %cl, %edx
 ; X86-NOBMI2-NEXT:    shrdl %cl, %esi, %eax
@@ -195,10 +195,10 @@ define i64 @xor127_shiftr64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: xor127_shiftr64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorb $127, %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -232,10 +232,10 @@ define i64 @xor64_shiftl64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor64_shiftl64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    xorb $64, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -250,10 +250,10 @@ define i64 @xor64_shiftl64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: xor64_shiftl64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    xorb $64, %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -285,10 +285,10 @@ define i64 @sub1s_shiftr64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub1s_shiftr64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI2-NEXT:    movl %esi, %edx
 ; X86-NOBMI2-NEXT:    shrl %cl, %edx
 ; X86-NOBMI2-NEXT:    shrdl %cl, %esi, %eax
@@ -303,10 +303,10 @@ define i64 @sub1s_shiftr64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: sub1s_shiftr64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    notb %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -340,10 +340,10 @@ define i64 @xor1s_shiftl64(i64 %val, i64 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor1s_shiftl64:
 ; X86-NOBMI2:       # %bb.0:
 ; X86-NOBMI2-NEXT:    pushl %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI2-NEXT:    movl %esi, %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    shldl %cl, %esi, %edx
@@ -358,10 +358,10 @@ define i64 @xor1s_shiftl64(i64 %val, i64 %cnt) nounwind {
 ;
 ; X86-BMI2-LABEL: xor1s_shiftl64:
 ; X86-BMI2:       # %bb.0:
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI2-NEXT:    notb %cl
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI2-NEXT:    shlxl %ecx, %eax, %eax
 ; X86-BMI2-NEXT:    testb $32, %cl
@@ -394,9 +394,9 @@ define i64 @xor1s_shiftl64(i64 %val, i64 %cnt) nounwind {
 define i32 @sub31_shiftr32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub31_shiftr32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -429,9 +429,9 @@ define i32 @sub31_shiftr32(i32 %val, i32 %cnt) nounwind {
 define i32 @xor31_shiftl32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor31_shiftl32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -464,9 +464,9 @@ define i32 @xor31_shiftl32(i32 %val, i32 %cnt) nounwind {
 define i32 @sub63_shiftr32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub63_shiftr32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -499,9 +499,9 @@ define i32 @sub63_shiftr32(i32 %val, i32 %cnt) nounwind {
 define i32 @xor63_shiftl32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor63_shiftl32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -565,9 +565,9 @@ define i32 @xor32_shiftr32(i32 %val, i32 %cnt) nounwind {
 define i32 @sub1s_shiftl32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: sub1s_shiftl32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -600,9 +600,9 @@ define i32 @sub1s_shiftl32(i32 %val, i32 %cnt) nounwind {
 define i32 @xor1s_shiftr32(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: xor1s_shiftr32:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    notb %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shrl %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -634,9 +634,9 @@ define i32 @xor1s_shiftr32(i32 %val, i32 %cnt) nounwind {
 define i32 @invalid_sub31(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: invalid_sub31:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    addb $-31, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
@@ -669,9 +669,9 @@ define i32 @invalid_sub31(i32 %val, i32 %cnt) nounwind {
 define i32 @invalid_add31(i32 %val, i32 %cnt) nounwind {
 ; X86-NOBMI2-LABEL: invalid_add31:
 ; X86-NOBMI2:       # %bb.0:
-; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI2-NEXT:    addb $31, %cl
+; X86-NOBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI2-NEXT:    shll %cl, %eax
 ; X86-NOBMI2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index b6af7e1641a9c..c94ac2abedf19 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -15,138 +15,138 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, %ebp
+; CHECK-NEXT:    imull %esi, %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    imull %esi, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    movl %eax, %ebp
-; CHECK-NEXT:    sarl $31, %ebp
-; CHECK-NEXT:    shrl $30, %ebp
-; CHECK-NEXT:    addl %eax, %ebp
-; CHECK-NEXT:    sarl $2, %ebp
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    movl %ebp, %edi
+; CHECK-NEXT:    sarl $31, %edi
+; CHECK-NEXT:    shrl $30, %edi
+; CHECK-NEXT:    addl %ebp, %edi
+; CHECK-NEXT:    sarl $2, %edi
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
 ; CHECK-NEXT:    testl %esi, %esi
 ; CHECK-NEXT:    jle LBB0_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    incl %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    leal 1(%edx), %eax
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl %ebx, %edx
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_4: ## %bb6
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movzbl (%eax,%edi,2), %ebx
-; CHECK-NEXT:    movb %bl, (%edx,%edi)
-; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    movzbl (%eax,%ebx,2), %ecx
+; CHECK-NEXT:    movb %cl, (%edx,%ebx)
+; CHECK-NEXT:    incl %ebx
+; CHECK-NEXT:    cmpl %esi, %ebx
 ; CHECK-NEXT:    jl LBB0_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT:    incl %ecx
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    addl %esi, %edx
-; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl %ecx, %ebp
 ; CHECK-NEXT:    je LBB0_12
 ; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    jmp LBB0_4
 ; CHECK-NEXT:  LBB0_12: ## %bb18.loopexit
-; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; CHECK-NEXT:    addl %ebp, %eax
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $1, %ecx
 ; CHECK-NEXT:    jle LBB0_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
 ; CHECK-NEXT:    cmpl $2, %esi
 ; CHECK-NEXT:    jl LBB0_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl %esi, %ebp
-; CHECK-NEXT:    shrl $31, %ebp
-; CHECK-NEXT:    addl %esi, %ebp
-; CHECK-NEXT:    sarl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $31, %ecx
 ; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    sarl %ecx
 ; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shrl $31, %edi
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    sarl %edi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; CHECK-NEXT:    movl (%esp), %eax ## 4-byte Reload
 ; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    addl $2, %edx
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB0_10 Depth 2
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    addl %edx, %edi
-; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addl %ebx, %edx
+; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_10: ## %bb14
 ; CHECK-NEXT:    ## Parent Loop BB0_9 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    movzbl -2(%edi,%ebx,4), %edx
-; CHECK-NEXT:    movb %dl, (%ecx,%ebx)
-; CHECK-NEXT:    movzbl (%edi,%ebx,4), %edx
-; CHECK-NEXT:    movb %dl, (%eax,%ebx)
-; CHECK-NEXT:    incl %ebx
-; CHECK-NEXT:    cmpl %ebp, %ebx
+; CHECK-NEXT:    movzbl -2(%edx,%ebp,4), %ebx
+; CHECK-NEXT:    movb %bl, (%eax,%ebp)
+; CHECK-NEXT:    movzbl (%edx,%ebp,4), %ebx
+; CHECK-NEXT:    movb %bl, (%ecx,%ebp)
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpl %edi, %ebp
 ; CHECK-NEXT:    jl LBB0_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB0_9 Depth=1
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    addl %ebp, %eax
+; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    addl %edi, %ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; CHECK-NEXT:    addl $2, %ebx
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl $2, %edx
-; CHECK-NEXT:    addl %ebp, %ecx
-; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; CHECK-NEXT:    incl %edx
+; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; CHECK-NEXT:    jl LBB0_9
 ; CHECK-NEXT:  LBB0_13: ## %bb20
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl $1, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    cmpl $1, %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    je LBB0_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %ecx
+; CHECK-NEXT:    cmpl $3, %eax
 ; CHECK-NEXT:    jne LBB0_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; CHECK-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; CHECK-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    jle LBB0_18
 ; CHECK-NEXT:  ## %bb.16: ## %bb.nph
-; CHECK-NEXT:    leal 15(%edx), %eax
+; CHECK-NEXT:    leal 15(%ecx), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl %ebp, %ebp
-; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT:    addl %edi, %ecx
-; CHECK-NEXT:    addl %ecx, %ebp
-; CHECK-NEXT:    addl %eax, %ebx
+; CHECK-NEXT:    addl %eax, %edx
+; CHECK-NEXT:    addl %edi, %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%esp), %ebx ## 4-byte Reload
+; CHECK-NEXT:    addl %ebx, %eax
+; CHECK-NEXT:    addl %eax, %edi
 ; CHECK-NEXT:    leal 15(%esi), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -155,65 +155,69 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    movl %ebp, %edi
-; CHECK-NEXT:    movl %ebx, %ebp
-; CHECK-NEXT:    movl %edx, %ebx
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %ecx, %edi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl %ebx, %edx
-; CHECK-NEXT:    movl %ebp, %ebx
-; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    movl %ebp, %edx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %esi, %ebp
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %edx
+; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    jne LBB0_17
 ; CHECK-NEXT:  LBB0_18: ## %bb26
-; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; CHECK-NEXT:    addl %ecx, %esi
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movl (%esp), %esi ## 4-byte Reload
 ; CHECK-NEXT:    addl %esi, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    addl %esi, %eax
 ; CHECK-NEXT:    jmp LBB0_23
 ; CHECK-NEXT:  LBB0_19: ## %bb29
-; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    jle LBB0_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb.nph11
 ; CHECK-NEXT:    leal 15(%esi), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_21: ## %bb30
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    movl %ebx, %ebp
-; CHECK-NEXT:    movl %edx, %ebx
+; CHECK-NEXT:    addl %esi, %ebx
+; CHECK-NEXT:    movl %edx, %ebp
+; CHECK-NEXT:    movl %ecx, %edi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl %ebx, %edx
-; CHECK-NEXT:    movl %ebp, %ebx
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %ebp, %edx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %esi, %edi
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %edx
+; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    jne LBB0_21
 ; CHECK-NEXT:  LBB0_22: ## %bb33
-; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:  LBB0_23: ## %bb33
-; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:  LBB0_23: ## %bb33
 ; CHECK-NEXT:    sarl %eax
-; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $128
-; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $44, %esp
 ; CHECK-NEXT:  LBB0_25: ## %return
@@ -457,30 +461,26 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $28, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ebp, %edx
-; CHECK-NEXT:    imull %eax, %edx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    imull %ebp, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shrl $2, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    testl %ebp, %ebp
-; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.2: ## %bb.nph9
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    testl %ebp, %ebp
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.3: ## %bb.nph9.split
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    leal 1(%edi), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB1_4: ## %bb6
@@ -488,14 +488,14 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movzbl (%eax,%esi,2), %ebx
 ; CHECK-NEXT:    movb %bl, (%edx,%esi)
 ; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %edi, %esi
+; CHECK-NEXT:    cmpl %ebp, %esi
 ; CHECK-NEXT:    jb LBB1_4
 ; CHECK-NEXT:  ## %bb.5: ## %bb9
 ; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    addl %ebp, %edx
 ; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    addl %edi, %edx
-; CHECK-NEXT:    cmpl %ebp, %ecx
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    je LBB1_12
 ; CHECK-NEXT:  ## %bb.6: ## %bb7.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB1_4 Depth=1
@@ -506,32 +506,32 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    cmpl $1, %ebp
+; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    jbe LBB1_13
 ; CHECK-NEXT:  ## %bb.7: ## %bb.nph5
-; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    cmpl $2, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    jb LBB1_13
 ; CHECK-NEXT:  ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT:    movl %edi, %ebp
-; CHECK-NEXT:    shrl %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    shrl %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    shrl %edi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    addl $2, %edx
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB1_9: ## %bb13
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    movl %ebp, %edx
 ; CHECK-NEXT:    andl $1, %edx
 ; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    addl %esi, %edx
@@ -543,47 +543,44 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    ## Parent Loop BB1_9 Depth=1
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    movzbl -2(%edx,%esi,4), %ebx
-; CHECK-NEXT:    movb %bl, (%eax,%esi)
-; CHECK-NEXT:    movzbl (%edx,%esi,4), %ebx
 ; CHECK-NEXT:    movb %bl, (%ecx,%esi)
+; CHECK-NEXT:    movzbl (%edx,%esi,4), %ebx
+; CHECK-NEXT:    movb %bl, (%eax,%esi)
 ; CHECK-NEXT:    incl %esi
-; CHECK-NEXT:    cmpl %ebp, %esi
+; CHECK-NEXT:    cmpl %edi, %esi
 ; CHECK-NEXT:    jb LBB1_10
 ; CHECK-NEXT:  ## %bb.11: ## %bb17
 ; CHECK-NEXT:    ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT:    incl %edx
-; CHECK-NEXT:    addl %ebp, %ecx
+; CHECK-NEXT:    addl %edi, %ecx
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; CHECK-NEXT:    addl $2, %esi
-; CHECK-NEXT:    addl %ebp, %eax
-; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; CHECK-NEXT:    incl %ebp
+; CHECK-NEXT:    cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; CHECK-NEXT:    jb LBB1_9
 ; CHECK-NEXT:  LBB1_13: ## %bb20
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    cmpl $1, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    cmpl $1, %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    je LBB1_19
 ; CHECK-NEXT:  ## %bb.14: ## %bb20
-; CHECK-NEXT:    cmpl $3, %esi
+; CHECK-NEXT:    cmpl $3, %edx
 ; CHECK-NEXT:    jne LBB1_24
 ; CHECK-NEXT:  ## %bb.15: ## %bb22
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; CHECK-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; CHECK-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    je LBB1_18
 ; CHECK-NEXT:  ## %bb.16: ## %bb.nph
-; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    leal 15(%ebp), %eax
+; CHECK-NEXT:    leal 15(%ecx), %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    imull {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    addl $15, %edx
-; CHECK-NEXT:    andl $-16, %edx
-; CHECK-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    leal 15(%edx), %ebx
+; CHECK-NEXT:    andl $-16, %ebx
+; CHECK-NEXT:    addl %eax, %edi
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    leal (%edx,%eax), %ebp
@@ -591,61 +588,62 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:  LBB1_17: ## %bb23
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    addl %ebx, %edi
 ; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    addl %eax, %ebp
+; CHECK-NEXT:    movl %ecx, %esi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %ebx, %ebp
-; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %esi
+; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    jne LBB1_17
 ; CHECK-NEXT:  LBB1_18: ## %bb26
+; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    shrl %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    shrl %ecx
-; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl $128
 ; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    jmp LBB1_23
 ; CHECK-NEXT:  LBB1_19: ## %bb29
-; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    testl %ecx, %ecx
 ; CHECK-NEXT:    je LBB1_22
 ; CHECK-NEXT:  ## %bb.20: ## %bb.nph11
 ; CHECK-NEXT:    movl %ebp, %esi
-; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    addl $15, %eax
+; CHECK-NEXT:    leal 15(%ebp), %eax
 ; CHECK-NEXT:    andl $-16, %eax
-; CHECK-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB1_21: ## %bb30
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %ebp
-; CHECK-NEXT:    movl %ecx, %ebx
+; CHECK-NEXT:    addl %esi, %ebp
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    movl %ecx, %edi
 ; CHECK-NEXT:    calll _memcpy
-; CHECK-NEXT:    movl %ebx, %ecx
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    movl %ebx, %edi
 ; CHECK-NEXT:    addl $16, %esp
-; CHECK-NEXT:    addl %edi, %ebp
-; CHECK-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; CHECK-NEXT:    decl %esi
+; CHECK-NEXT:    decl %ecx
 ; CHECK-NEXT:    jne LBB1_21
 ; CHECK-NEXT:  LBB1_22: ## %bb33
+; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $128
 ; CHECK-NEXT:    pushl %ecx
diff --git a/llvm/test/CodeGen/X86/or-lea.ll b/llvm/test/CodeGen/X86/or-lea.ll
index 616ab99437892..22f5ed810cebc 100644
--- a/llvm/test/CodeGen/X86/or-lea.ll
+++ b/llvm/test/CodeGen/X86/or-lea.ll
@@ -12,9 +12,9 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift1_and1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    leal (%ecx,%eax,2), %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift1_and1:
@@ -34,9 +34,9 @@ define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift1_and1_swapped:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    leal (%ecx,%eax,2), %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift1_and1_swapped:
@@ -56,9 +56,9 @@ define i32 @or_shift2_and1(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift2_and1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    leal (%ecx,%eax,4), %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift2_and1:
@@ -78,9 +78,9 @@ define i32 @or_shift3_and1(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift3_and1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    leal (%ecx,%eax,8), %eax
+; X86-NEXT:    leal (%eax,%ecx,8), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift3_and1:
@@ -100,9 +100,9 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift3_and7:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $7, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $7, %ecx
-; X86-NEXT:    leal (%ecx,%eax,8), %eax
+; X86-NEXT:    leal (%eax,%ecx,8), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift3_and7:
@@ -123,10 +123,10 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) {
 define i32 @or_shift4_and1(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift4_and1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -149,10 +149,10 @@ define i32 @or_shift4_and1(i32 %x, i32 %y) {
 define i32 @or_shift3_and8(i32 %x, i32 %y) {
 ; X86-LABEL: or_shift3_and8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $3, %ecx
-; X86-NEXT:    andl $8, %eax
+; X86-NEXT:    andl $8, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -175,11 +175,11 @@ define i64 @or_shift1_and1_64(i64 %x, i64 %y) {
 ; X86-LABEL: or_shift1_and1_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    shldl $1, %ecx, %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: or_shift1_and1_64:
diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll
index 5900e7674cd0e..cac30e30bb84f 100644
--- a/llvm/test/CodeGen/X86/overflow.ll
+++ b/llvm/test/CodeGen/X86/overflow.ll
@@ -27,23 +27,24 @@ define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb %al
+; X32-NEXT:    movzbl %al, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl %bl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    adcl %edi, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    andl $1, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    addl %eax, %esi
+; X32-NEXT:    movl %esi, (%ecx)
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, 4(%eax)
-; X32-NEXT:    setb %cl
-; X32-NEXT:    movzbl %cl, %ecx
-; X32-NEXT:    movl %ecx, 8(%eax)
-; X32-NEXT:    movl $0, 12(%eax)
+; X32-NEXT:    movl %edx, 4(%ecx)
+; X32-NEXT:    setb %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl %eax, 8(%ecx)
+; X32-NEXT:    movl $0, 12(%ecx)
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index 91e4b9b463b0a..468da6853a5b6 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -164,22 +164,22 @@ define <8 x i16> @trunc_ashr_v8i32(<8 x i32> %a) nounwind {
 define <8 x i16> @trunc_ashr_v4i32_icmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; X86-SSE-LABEL: trunc_ashr_v4i32_icmp_v4i32:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    psrad $31, %xmm0
 ; X86-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    psrad $31, %xmm0
 ; X86-SSE-NEXT:    packssdw %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: trunc_ashr_v4i32_icmp_v4i32:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: trunc_ashr_v4i32_icmp_v4i32:
 ; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    retl
 ;
@@ -212,30 +212,6 @@ define <8 x i16> @trunc_ashr_v4i32_icmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwi
 }
 
 define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_ashr_v4i64_demandedelts:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psllq $63, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE4-LABEL: trunc_ashr_v4i64_demandedelts:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    psllq $63, %xmm0
-; SSE4-NEXT:    pxor %xmm2, %xmm2
-; SSE4-NEXT:    pxor %xmm3, %xmm3
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm3
-; SSE4-NEXT:    psllq $63, %xmm1
-; SSE4-NEXT:    pcmpgtq %xmm1, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
-; SSE4-NEXT:    packssdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
-;
 ; AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
@@ -334,28 +310,28 @@ define <16 x i8> @packsswb_icmp_zero_trunc_128(<8 x i16> %a0) {
 }
 
 define <32 x i8> @packsswb_icmp_zero_256(<16 x i16> %a0) {
-; SSE-LABEL: packsswb_icmp_zero_256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    packsswb %xmm0, %xmm3
-; SSE-NEXT:    packsswb %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: packsswb_icmp_zero_256:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X86-SSE-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE-NEXT:    packsswb %xmm0, %xmm3
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X86-SSE-NEXT:    packsswb %xmm1, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE-NEXT:    retl
 ;
-; AVX1-LABEL: packsswb_icmp_zero_256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: packsswb_icmp_zero_256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpacksswb %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: packsswb_icmp_zero_256:
 ; AVX2:       # %bb.0:
@@ -370,6 +346,29 @@ define <32 x i8> @packsswb_icmp_zero_256(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-SSE-LABEL: packsswb_icmp_zero_256:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X64-SSE-NEXT:    pxor %xmm3, %xmm3
+; X64-SSE-NEXT:    packsswb %xmm0, %xmm3
+; X64-SSE-NEXT:    packsswb %xmm1, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: packsswb_icmp_zero_256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    retq
   %1 = icmp eq <16 x i16> %a0, zeroinitializer
   %2 = sext <16 x i1> %1 to <16 x i16>
   %3 = bitcast <16 x i16> %2 to <32 x i8>
@@ -378,28 +377,28 @@ define <32 x i8> @packsswb_icmp_zero_256(<16 x i16> %a0) {
 }
 
 define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) {
-; SSE-LABEL: packsswb_icmp_zero_trunc_256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    packsswb %xmm0, %xmm3
-; SSE-NEXT:    packsswb %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: packsswb_icmp_zero_trunc_256:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X86-SSE-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE-NEXT:    packsswb %xmm0, %xmm3
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X86-SSE-NEXT:    packsswb %xmm1, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE-NEXT:    retl
 ;
-; AVX1-LABEL: packsswb_icmp_zero_trunc_256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: packsswb_icmp_zero_trunc_256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpacksswb %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpacksswb %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: packsswb_icmp_zero_trunc_256:
 ; AVX2:       # %bb.0:
@@ -418,6 +417,29 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-SSE-LABEL: packsswb_icmp_zero_trunc_256:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X64-SSE-NEXT:    pxor %xmm3, %xmm3
+; X64-SSE-NEXT:    packsswb %xmm0, %xmm3
+; X64-SSE-NEXT:    packsswb %xmm1, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: packsswb_icmp_zero_trunc_256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpacksswb %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    retq
   %1 = icmp eq <16 x i16> %a0, zeroinitializer
   %2 = sext <16 x i1> %1 to <16 x i16>
   %3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -455,14 +477,14 @@ define <32 x i8> @_mm256_packss_epi16_manual(<16 x i16> %a, <16 x i16> %b) nounw
 ; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl
 ;
-; AVX1-LABEL: _mm256_packss_epi16_manual:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: _mm256_packss_epi16_manual:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpacksswb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: _mm256_packss_epi16_manual:
 ; AVX2:       # %bb.0:
@@ -479,6 +501,15 @@ define <32 x i8> @_mm256_packss_epi16_manual(<16 x i16> %a, <16 x i16> %b) nounw
 ; X64-SSE-NEXT:    packsswb %xmm2, %xmm0
 ; X64-SSE-NEXT:    packsswb %xmm3, %xmm1
 ; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: _mm256_packss_epi16_manual:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    retq
   %sh   = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %minv = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 -128))
   %sat  = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> %minv, <32 x i16> splat (i16 127))
@@ -493,10 +524,10 @@ define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packsswb 24(%ebp), %xmm0
 ; X86-SSE-NEXT:    packsswb 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    packsswb 56(%ebp), %xmm2
+; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packsswb 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -508,15 +539,15 @@ define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
 ; X86-AVX1-NEXT:    movl %esp, %ebp
 ; X86-AVX1-NEXT:    andl $-32, %esp
 ; X86-AVX1-NEXT:    subl $32, %esp
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; X86-AVX1-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpacksswb %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vpacksswb 8(%ebp), %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpacksswb 24(%ebp), %xmm3, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vpacksswb 8(%ebp), %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vpacksswb 24(%ebp), %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    retl
@@ -602,14 +633,14 @@ define <16 x i16> @_mm256_packss_epi32_manual(<8 x i32> %a, <8 x i32> %b) nounwi
 ; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl
 ;
-; AVX1-LABEL: _mm256_packss_epi32_manual:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: _mm256_packss_epi32_manual:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: _mm256_packss_epi32_manual:
 ; AVX2:       # %bb.0:
@@ -626,6 +657,15 @@ define <16 x i16> @_mm256_packss_epi32_manual(<8 x i32> %a, <8 x i32> %b) nounwi
 ; X64-SSE-NEXT:    packssdw %xmm2, %xmm0
 ; X64-SSE-NEXT:    packssdw %xmm3, %xmm1
 ; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: _mm256_packss_epi32_manual:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    retq
   %sh   = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %minv = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 -32768))
   %sat  = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %minv, <16 x i32> splat (i32 32767))
@@ -640,10 +680,10 @@ define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packssdw 24(%ebp), %xmm0
 ; X86-SSE-NEXT:    packssdw 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    packssdw 56(%ebp), %xmm2
+; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packssdw 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -655,15 +695,15 @@ define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
 ; X86-AVX1-NEXT:    movl %esp, %ebp
 ; X86-AVX1-NEXT:    andl $-32, %esp
 ; X86-AVX1-NEXT:    subl $32, %esp
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; X86-AVX1-NEXT:    vpackssdw %xmm5, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpackssdw %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vpackssdw 8(%ebp), %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpackssdw 24(%ebp), %xmm3, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vpackssdw 8(%ebp), %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vpackssdw 24(%ebp), %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index e678991fcec78..3c0475f56cbf9 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -213,19 +213,47 @@ define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) {
 }
 
 define <16 x i8> @shuffle_lshr_2v8i16(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: shuffle_lshr_2v8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    psrlw $15, %xmm0
-; SSE-NEXT:    psrlw $15, %xmm1
-; SSE-NEXT:    packuswb %xmm1, %xmm0
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: shuffle_lshr_2v8i16:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    psrlw $15, %xmm1
+; X86-SSE-NEXT:    psrlw $15, %xmm0
+; X86-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X86-SSE-NEXT:    retl
 ;
-; AVX-LABEL: shuffle_lshr_2v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm1
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    ret{{[l|q]}}
+; X64-SSE-LABEL: shuffle_lshr_2v8i16:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    psrlw $15, %xmm0
+; X64-SSE-NEXT:    psrlw $15, %xmm1
+; X64-SSE-NEXT:    packuswb %xmm1, %xmm0
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: shuffle_lshr_2v8i16:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: shuffle_lshr_2v8i16:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+;
+; X86-AVX512-LABEL: shuffle_lshr_2v8i16:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: shuffle_lshr_2v8i16:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
   %lshr0 = lshr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %lshr1 = lshr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
   %bc0 = bitcast <8 x i16> %lshr0 to <16 x i8>
@@ -235,26 +263,61 @@ define <16 x i8> @shuffle_lshr_2v8i16(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <8 x i16> @shuffle_lshr_2v4i32(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE2-LABEL: shuffle_lshr_2v4i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrld $31, %xmm0
-; SSE2-NEXT:    psrld $31, %xmm1
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: shuffle_lshr_2v4i32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    psrld $31, %xmm1
+; X86-SSE2-NEXT:    psrld $31, %xmm0
+; X86-SSE2-NEXT:    packssdw %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
-; SSE4-LABEL: shuffle_lshr_2v4i32:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    psrld $31, %xmm0
-; SSE4-NEXT:    psrld $31, %xmm1
-; SSE4-NEXT:    packusdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; X64-SSE2-LABEL: shuffle_lshr_2v4i32:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    psrld $31, %xmm0
+; X64-SSE2-NEXT:    psrld $31, %xmm1
+; X64-SSE2-NEXT:    packssdw %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_lshr_2v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $31, %xmm1, %xmm1
-; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    ret{{[l|q]}}
+; X86-SSE4-LABEL: shuffle_lshr_2v4i32:
+; X86-SSE4:       # %bb.0:
+; X86-SSE4-NEXT:    psrld $31, %xmm1
+; X86-SSE4-NEXT:    psrld $31, %xmm0
+; X86-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X86-SSE4-NEXT:    retl
+;
+; X64-SSE4-LABEL: shuffle_lshr_2v4i32:
+; X64-SSE4:       # %bb.0:
+; X64-SSE4-NEXT:    psrld $31, %xmm0
+; X64-SSE4-NEXT:    psrld $31, %xmm1
+; X64-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X64-SSE4-NEXT:    retq
+;
+; X86-AVX-LABEL: shuffle_lshr_2v4i32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: shuffle_lshr_2v4i32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+;
+; X86-AVX512-LABEL: shuffle_lshr_2v4i32:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: shuffle_lshr_2v4i32:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpsrld $31, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrld $31, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
   %lshr0 = lshr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
   %lshr1 = lshr <4 x i32> %a1, <i32 31, i32 31, i32 31, i32 31>
   %bc0 = bitcast <4 x i32> %lshr0 to <8 x i16>
@@ -264,26 +327,61 @@ define <8 x i16> @shuffle_lshr_2v4i32(<4 x i32> %a0, <4 x i32> %a1) {
 }
 
 define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE2-LABEL: shuffle_lshr_2v2i64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlq $63, %xmm0
-; SSE2-NEXT:    psrlq $63, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: shuffle_lshr_2v2i64:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    psrlq $63, %xmm1
+; X86-SSE2-NEXT:    psrlq $63, %xmm0
+; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
-; SSE4-LABEL: shuffle_lshr_2v2i64:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    psrlq $63, %xmm0
-; SSE4-NEXT:    psrlq $63, %xmm1
-; SSE4-NEXT:    packusdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; X64-SSE2-LABEL: shuffle_lshr_2v2i64:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    psrlq $63, %xmm0
+; X64-SSE2-NEXT:    psrlq $63, %xmm1
+; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_lshr_2v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
-; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    ret{{[l|q]}}
+; X86-SSE4-LABEL: shuffle_lshr_2v2i64:
+; X86-SSE4:       # %bb.0:
+; X86-SSE4-NEXT:    psrlq $63, %xmm1
+; X86-SSE4-NEXT:    psrlq $63, %xmm0
+; X86-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X86-SSE4-NEXT:    retl
+;
+; X64-SSE4-LABEL: shuffle_lshr_2v2i64:
+; X64-SSE4:       # %bb.0:
+; X64-SSE4-NEXT:    psrlq $63, %xmm0
+; X64-SSE4-NEXT:    psrlq $63, %xmm1
+; X64-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X64-SSE4-NEXT:    retq
+;
+; X86-AVX-LABEL: shuffle_lshr_2v2i64:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: shuffle_lshr_2v2i64:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+;
+; X86-AVX512-LABEL: shuffle_lshr_2v2i64:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: shuffle_lshr_2v2i64:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
   %lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
   %lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
   %bc0 = bitcast <2 x i64> %lshr0 to <4 x i32>
@@ -293,26 +391,61 @@ define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 }
 
 define <4 x float> @shuffle_lshr_2v2i64_bitcast(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE2-LABEL: shuffle_lshr_2v2i64_bitcast:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlq $63, %xmm0
-; SSE2-NEXT:    psrlq $63, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm0
-; SSE2-NEXT:    ret{{[l|q]}}
+; X86-SSE2-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    psrlq $63, %xmm1
+; X86-SSE2-NEXT:    psrlq $63, %xmm0
+; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X86-SSE2-NEXT:    retl
 ;
-; SSE4-LABEL: shuffle_lshr_2v2i64_bitcast:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    psrlq $63, %xmm0
-; SSE4-NEXT:    psrlq $63, %xmm1
-; SSE4-NEXT:    packusdw %xmm1, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; X64-SSE2-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    psrlq $63, %xmm0
+; X64-SSE2-NEXT:    psrlq $63, %xmm1
+; X64-SSE2-NEXT:    packuswb %xmm1, %xmm0
+; X64-SSE2-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_lshr_2v2i64_bitcast:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
-; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    ret{{[l|q]}}
+; X86-SSE4-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X86-SSE4:       # %bb.0:
+; X86-SSE4-NEXT:    psrlq $63, %xmm1
+; X86-SSE4-NEXT:    psrlq $63, %xmm0
+; X86-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X86-SSE4-NEXT:    retl
+;
+; X64-SSE4-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X64-SSE4:       # %bb.0:
+; X64-SSE4-NEXT:    psrlq $63, %xmm0
+; X64-SSE4-NEXT:    psrlq $63, %xmm1
+; X64-SSE4-NEXT:    packusdw %xmm1, %xmm0
+; X64-SSE4-NEXT:    retq
+;
+; X86-AVX-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    retq
+;
+; X86-AVX512-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X86-AVX512-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: shuffle_lshr_2v2i64_bitcast:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpsrlq $63, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlq $63, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    retq
   %lshr0 = lshr <2 x i64> %a0, <i64 63, i64 63>
   %lshr1 = lshr <2 x i64> %a1, <i64 63, i64 63>
   %bc0 = bitcast <2 x i64> %lshr0 to <4 x float>
@@ -394,19 +527,33 @@ define <16 x i8> @packuswb_icmp_zero_trunc_128(<8 x i16> %a0) {
 }
 
 define <32 x i8> @packuswb_icmp_zero_256(<16 x i16> %a0) {
-; SSE-LABEL: packuswb_icmp_zero_256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; SSE-NEXT:    psrlw $15, %xmm1
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE-NEXT:    psrlw $15, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    packuswb %xmm0, %xmm3
-; SSE-NEXT:    packuswb %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: packuswb_icmp_zero_256:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X86-SSE-NEXT:    psrlw $15, %xmm0
+; X86-SSE-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE-NEXT:    packuswb %xmm0, %xmm3
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X86-SSE-NEXT:    psrlw $15, %xmm1
+; X86-SSE-NEXT:    packuswb %xmm1, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: packuswb_icmp_zero_256:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X64-SSE-NEXT:    psrlw $15, %xmm1
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X64-SSE-NEXT:    psrlw $15, %xmm0
+; X64-SSE-NEXT:    pxor %xmm3, %xmm3
+; X64-SSE-NEXT:    packuswb %xmm0, %xmm3
+; X64-SSE-NEXT:    packuswb %xmm1, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: packuswb_icmp_zero_256:
 ; X86-AVX1:       # %bb.0:
@@ -459,32 +606,59 @@ define <32 x i8> @packuswb_icmp_zero_256(<16 x i16> %a0) {
 }
 
 define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) {
-; SSE-LABEL: packuswb_icmp_zero_trunc_256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; SSE-NEXT:    psrlw $15, %xmm1
-; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE-NEXT:    psrlw $15, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    packuswb %xmm0, %xmm3
-; SSE-NEXT:    packuswb %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm3, %xmm0
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: packuswb_icmp_zero_trunc_256:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    pxor %xmm2, %xmm2
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X86-SSE-NEXT:    psrlw $15, %xmm0
+; X86-SSE-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE-NEXT:    packuswb %xmm0, %xmm3
+; X86-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X86-SSE-NEXT:    psrlw $15, %xmm1
+; X86-SSE-NEXT:    packuswb %xmm1, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE-NEXT:    retl
 ;
-; AVX1-LABEL: packuswb_icmp_zero_trunc_256:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X64-SSE-LABEL: packuswb_icmp_zero_trunc_256:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pxor %xmm2, %xmm2
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
+; X64-SSE-NEXT:    psrlw $15, %xmm1
+; X64-SSE-NEXT:    pcmpeqw %xmm2, %xmm0
+; X64-SSE-NEXT:    psrlw $15, %xmm0
+; X64-SSE-NEXT:    pxor %xmm3, %xmm3
+; X64-SSE-NEXT:    packuswb %xmm0, %xmm3
+; X64-SSE-NEXT:    packuswb %xmm1, %xmm2
+; X64-SSE-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: packuswb_icmp_zero_trunc_256:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpsrlw $15, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: packuswb_icmp_zero_trunc_256:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $15, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: packuswb_icmp_zero_trunc_256:
 ; AVX2:       # %bb.0:
@@ -547,14 +721,23 @@ define <32 x i8> @_mm256_packus_epi16_manual(<16 x i16> %a, <16 x i16> %b) nounw
 ; X64-SSE-NEXT:    packuswb %xmm3, %xmm1
 ; X64-SSE-NEXT:    retq
 ;
-; AVX1-LABEL: _mm256_packus_epi16_manual:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: _mm256_packus_epi16_manual:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: _mm256_packus_epi16_manual:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _mm256_packus_epi16_manual:
 ; AVX2:       # %bb.0:
@@ -579,10 +762,10 @@ define <64 x i8> @_mm512_packus_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packuswb 24(%ebp), %xmm0
 ; X86-SSE-NEXT:    packuswb 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    packuswb 56(%ebp), %xmm2
+; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    packuswb 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -602,15 +785,15 @@ define <64 x i8> @_mm512_packus_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
 ; X86-AVX1-NEXT:    movl %esp, %ebp
 ; X86-AVX1-NEXT:    andl $-32, %esp
 ; X86-AVX1-NEXT:    subl $32, %esp
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; X86-AVX1-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vpackuswb 8(%ebp), %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpackuswb 24(%ebp), %xmm3, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vpackuswb 8(%ebp), %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vpackuswb 24(%ebp), %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    retl
@@ -662,25 +845,25 @@ define <8 x i16> @_mm_packus_epi32_manual(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; X86-SSE2-LABEL: _mm_packus_epi32_manual:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm4
-; X86-SSE2-NEXT:    por %xmm4, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    por %xmm5, %xmm1
 ; X86-SSE2-NEXT:    pslld $16, %xmm1
 ; X86-SSE2-NEXT:    psrad $16, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pslld $16, %xmm0
 ; X86-SSE2-NEXT:    psrad $16, %xmm0
 ; X86-SSE2-NEXT:    packssdw %xmm1, %xmm0
@@ -736,51 +919,51 @@ define <16 x i16> @_mm256_packus_epi32_manual(<8 x i32> %a, <8 x i32> %b) nounwi
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm4
-; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm6
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; X86-SSE2-NEXT:    pand %xmm4, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm5, %xmm2
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm5, %xmm2
-; X86-SSE2-NEXT:    por %xmm6, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    pslld $16, %xmm5
+; X86-SSE2-NEXT:    psrad $16, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
 ; X86-SSE2-NEXT:    pand %xmm6, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
 ; X86-SSE2-NEXT:    por %xmm6, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm3
-; X86-SSE2-NEXT:    pxor %xmm5, %xmm4
-; X86-SSE2-NEXT:    por %xmm3, %xmm4
-; X86-SSE2-NEXT:    pslld $16, %xmm4
-; X86-SSE2-NEXT:    psrad $16, %xmm4
 ; X86-SSE2-NEXT:    pslld $16, %xmm0
 ; X86-SSE2-NEXT:    psrad $16, %xmm0
-; X86-SSE2-NEXT:    packssdw %xmm4, %xmm0
-; X86-SSE2-NEXT:    pslld $16, %xmm2
-; X86-SSE2-NEXT:    psrad $16, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm5, %xmm0
+; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    pslld $16, %xmm5
+; X86-SSE2-NEXT:    psrad $16, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pslld $16, %xmm1
 ; X86-SSE2-NEXT:    psrad $16, %xmm1
-; X86-SSE2-NEXT:    packssdw %xmm2, %xmm1
+; X86-SSE2-NEXT:    packssdw %xmm5, %xmm1
 ; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
@@ -851,14 +1034,23 @@ define <16 x i16> @_mm256_packus_epi32_manual(<8 x i32> %a, <8 x i32> %b) nounwi
 ; X64-SSE4-NEXT:    packusdw %xmm3, %xmm1
 ; X64-SSE4-NEXT:    retq
 ;
-; AVX1-LABEL: _mm256_packus_epi32_manual:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: _mm256_packus_epi32_manual:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: _mm256_packus_epi32_manual:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _mm256_packus_epi32_manual:
 ; AVX2:       # %bb.0:
@@ -882,114 +1074,98 @@ define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    andl $-16, %esp
-; X86-SSE2-NEXT:    subl $64, %esp
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqa 56(%ebp), %xmm0
-; X86-SSE2-NEXT:    movdqa 40(%ebp), %xmm1
-; X86-SSE2-NEXT:    movdqa 24(%ebp), %xmm5
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm7
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
-; X86-SSE2-NEXT:    pand %xmm5, %xmm7
-; X86-SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; X86-SSE2-NEXT:    pand %xmm1, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
-; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, (%esp) # 16-byte Spill
-; X86-SSE2-NEXT:    movdqa 72(%ebp), %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
-; X86-SSE2-NEXT:    pand %xmm5, %xmm0
-; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm7
-; X86-SSE2-NEXT:    movdqa %xmm7, %xmm5
-; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; X86-SSE2-NEXT:    pand %xmm7, %xmm5
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
+; X86-SSE2-NEXT:    movdqa 24(%ebp), %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm3, %xmm7
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
-; X86-SSE2-NEXT:    pand %xmm4, %xmm5
-; X86-SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
-; X86-SSE2-NEXT:    pxor %xmm7, %xmm4
-; X86-SSE2-NEXT:    por %xmm5, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
-; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
-; X86-SSE2-NEXT:    pand %xmm5, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm7, %xmm5
-; X86-SSE2-NEXT:    por %xmm0, %xmm5
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    movdqa (%esp), %xmm7 # 16-byte Reload
-; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm7
-; X86-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    por %xmm0, %xmm7
-; X86-SSE2-NEXT:    movdqa %xmm7, (%esp) # 16-byte Spill
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
-; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
-; X86-SSE2-NEXT:    pxor %xmm7, %xmm0
-; X86-SSE2-NEXT:    por %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm7, %xmm1
-; X86-SSE2-NEXT:    por %xmm1, %xmm6
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE2-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
-; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm7
-; X86-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pslld $16, %xmm6
+; X86-SSE2-NEXT:    psrad $16, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm7
+; X86-SSE2-NEXT:    por %xmm7, %xmm0
+; X86-SSE2-NEXT:    pslld $16, %xmm0
+; X86-SSE2-NEXT:    psrad $16, %xmm0
+; X86-SSE2-NEXT:    packssdw %xmm6, %xmm0
+; X86-SSE2-NEXT:    movdqa 40(%ebp), %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pslld $16, %xmm6
+; X86-SSE2-NEXT:    psrad $16, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm7
 ; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
+; X86-SSE2-NEXT:    packssdw %xmm6, %xmm1
+; X86-SSE2-NEXT:    movdqa 56(%ebp), %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pslld $16, %xmm6
+; X86-SSE2-NEXT:    psrad $16, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm7
 ; X86-SSE2-NEXT:    pand %xmm7, %xmm2
-; X86-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm7
 ; X86-SSE2-NEXT:    por %xmm7, %xmm2
-; X86-SSE2-NEXT:    movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
+; X86-SSE2-NEXT:    pslld $16, %xmm2
+; X86-SSE2-NEXT:    psrad $16, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm6, %xmm2
+; X86-SSE2-NEXT:    movdqa 72(%ebp), %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm7
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm6
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm7
+; X86-SSE2-NEXT:    pand %xmm4, %xmm7
 ; X86-SSE2-NEXT:    pcmpgtd %xmm7, %xmm3
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm7
-; X86-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm5, %xmm3
 ; X86-SSE2-NEXT:    por %xmm7, %xmm3
-; X86-SSE2-NEXT:    pslld $16, %xmm3
-; X86-SSE2-NEXT:    psrad $16, %xmm3
-; X86-SSE2-NEXT:    pslld $16, %xmm2
-; X86-SSE2-NEXT:    psrad $16, %xmm2
-; X86-SSE2-NEXT:    packssdw %xmm3, %xmm2
-; X86-SSE2-NEXT:    pslld $16, %xmm1
-; X86-SSE2-NEXT:    psrad $16, %xmm1
 ; X86-SSE2-NEXT:    pslld $16, %xmm6
 ; X86-SSE2-NEXT:    psrad $16, %xmm6
-; X86-SSE2-NEXT:    packssdw %xmm1, %xmm6
-; X86-SSE2-NEXT:    pslld $16, %xmm0
-; X86-SSE2-NEXT:    psrad $16, %xmm0
-; X86-SSE2-NEXT:    movdqa (%esp), %xmm3 # 16-byte Reload
 ; X86-SSE2-NEXT:    pslld $16, %xmm3
 ; X86-SSE2-NEXT:    psrad $16, %xmm3
-; X86-SSE2-NEXT:    packssdw %xmm0, %xmm3
-; X86-SSE2-NEXT:    pslld $16, %xmm5
-; X86-SSE2-NEXT:    psrad $16, %xmm5
-; X86-SSE2-NEXT:    pslld $16, %xmm4
-; X86-SSE2-NEXT:    psrad $16, %xmm4
-; X86-SSE2-NEXT:    packssdw %xmm5, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT:    packssdw %xmm6, %xmm3
 ; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
@@ -1090,10 +1266,10 @@ define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
 ; X86-SSE4-NEXT:    movl %esp, %ebp
 ; X86-SSE4-NEXT:    andl $-16, %esp
 ; X86-SSE4-NEXT:    subl $16, %esp
-; X86-SSE4-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE4-NEXT:    packusdw 24(%ebp), %xmm0
 ; X86-SSE4-NEXT:    packusdw 40(%ebp), %xmm1
 ; X86-SSE4-NEXT:    packusdw 56(%ebp), %xmm2
+; X86-SSE4-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE4-NEXT:    packusdw 72(%ebp), %xmm3
 ; X86-SSE4-NEXT:    movl %ebp, %esp
 ; X86-SSE4-NEXT:    popl %ebp
@@ -1113,15 +1289,15 @@ define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
 ; X86-AVX1-NEXT:    movl %esp, %ebp
 ; X86-AVX1-NEXT:    andl $-32, %esp
 ; X86-AVX1-NEXT:    subl $32, %esp
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; X86-AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vpackusdw 8(%ebp), %xmm1, %xmm1
-; X86-AVX1-NEXT:    vpackusdw 24(%ebp), %xmm3, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vpackusdw 8(%ebp), %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT:    vpackusdw 24(%ebp), %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/partial-fold32.ll b/llvm/test/CodeGen/X86/partial-fold32.ll
index 7fc1ed3521e9d..bdb50693107ee 100644
--- a/llvm/test/CodeGen/X86/partial-fold32.ll
+++ b/llvm/test/CodeGen/X86/partial-fold32.ll
@@ -1,9 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=i686-unknown-linux-gnu -enable-misched=false < %s | FileCheck %s
 
 define fastcc i8 @fold32to8(i32 %add, i8 %spill) {
 ; CHECK-LABEL: fold32to8:
-; CHECK:    movl %ecx, (%esp) # 4-byte Spill
-; CHECK:    subb (%esp), %dl  # 1-byte Folded Reload
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    .cfi_offset %esi, -20
+; CHECK-NEXT:    .cfi_offset %edi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    subb %cl, %dl
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
 entry:
   tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
   %trunc = trunc i32 %add to i8
@@ -14,9 +39,33 @@ entry:
 ; Do not fold a 1-byte store into a 4-byte spill slot
 define fastcc i8 @nofold(i32 %add, i8 %spill) {
 ; CHECK-LABEL: nofold:
-; CHECK:    movl %edx, (%esp) # 4-byte Spill
-; CHECK:    movl (%esp), %eax # 4-byte Reload
-; CHECK:    subb %cl, %al
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    .cfi_offset %esi, -20
+; CHECK-NEXT:    .cfi_offset %edi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    subb %cl, %dl
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
 entry:
   tail call void asm sideeffect "", "~{eax},~{ebx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
   %trunc = trunc i32 %add to i8
diff --git a/llvm/test/CodeGen/X86/peep-test-3.ll b/llvm/test/CodeGen/X86/peep-test-3.ll
index 1e962169d9b08..e1bd8e7b67fbe 100644
--- a/llvm/test/CodeGen/X86/peep-test-3.ll
+++ b/llvm/test/CodeGen/X86/peep-test-3.ll
@@ -8,11 +8,11 @@ define void @or(ptr %A, i32 %IA, i32 %N) nounwind {
 ; CHECK-LABEL: or:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $3, %edx
-; CHECK-NEXT:    xorl $1, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    andl $3, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    xorl $1, %edx
+; CHECK-NEXT:    orl %ecx, %edx
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %bb
 ; CHECK-NEXT:    movl $0, (%eax)
@@ -66,11 +66,11 @@ return:                                           ; preds = %entry
 define void @and(ptr %A, i32 %IA, i32 %N, ptr %p) nounwind {
 ; CHECK-LABEL: and:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl $1, %eax
 ; CHECK-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    andl $3, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movb %al, (%ecx)
 ; CHECK-NEXT:    je .LBB2_2
 ; CHECK-NEXT:  # %bb.1: # %bb
@@ -102,8 +102,8 @@ define void @test(ptr %A, i32 %IA, i32 %N, ptr %p) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movb $0, (%ecx)
+; CHECK-NEXT:    movb $0, (%eax)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl $1, %eax
 ; CHECK-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    testb $3, %al
diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index f3741dc202dc5..7a207e52bbfab 100644
--- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -271,13 +271,9 @@ f:
 define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 %bar1, i64 %baz1) nounwind {
 ; CHECK32-LABEL: test_two_live_flags:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    pushl %ebp
 ; CHECK32-NEXT:    pushl %ebx
-; CHECK32-NEXT:    pushl %edi
 ; CHECK32-NEXT:    pushl %esi
 ; CHECK32-NEXT:    pushl %eax
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -286,9 +282,9 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK32-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    movl %ebp, %edx
-; CHECK32-NEXT:    movl %edi, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK32-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK32-NEXT:    sete %al
@@ -304,9 +300,7 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64
 ; CHECK32-NEXT:    xorl %edx, %edx
 ; CHECK32-NEXT:    addl $4, %esp
 ; CHECK32-NEXT:    popl %esi
-; CHECK32-NEXT:    popl %edi
 ; CHECK32-NEXT:    popl %ebx
-; CHECK32-NEXT:    popl %ebp
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_two_live_flags:
@@ -344,14 +338,14 @@ f:
 define i1 @asm_clobbering_flags(ptr %mem) nounwind {
 ; CHECK32-LABEL: asm_clobbering_flags:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK32-NEXT:    movl (%ecx), %edx
-; CHECK32-NEXT:    testl %edx, %edx
-; CHECK32-NEXT:    setg %al
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl (%eax), %ecx
 ; CHECK32-NEXT:    #APP
-; CHECK32-NEXT:    bsfl %edx, %edx
+; CHECK32-NEXT:    bsfl %ecx, %edx
 ; CHECK32-NEXT:    #NO_APP
-; CHECK32-NEXT:    movl %edx, (%ecx)
+; CHECK32-NEXT:    movl %edx, (%eax)
+; CHECK32-NEXT:    testl %ecx, %ecx
+; CHECK32-NEXT:    setg %al
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: asm_clobbering_flags:
diff --git a/llvm/test/CodeGen/X86/pic.ll b/llvm/test/CodeGen/X86/pic.ll
index ef2849ca0cde6..714c525cff3f8 100644
--- a/llvm/test/CodeGen/X86/pic.ll
+++ b/llvm/test/CodeGen/X86/pic.ll
@@ -1,10 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux-gnu -relocation-model=pic -asm-verbose=false -post-RA-scheduler=false -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,CHECK-I686
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -relocation-model=pic -asm-verbose=false -post-RA-scheduler=false -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,CHECK-X32
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -relocation-model=pic -asm-verbose=false -post-RA-scheduler=false -fast-isel -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,CHECK-X32
 
-@ptr = external global ptr 
-@dst = external global i32 
-@src = external global i32 
+@ptr = external global ptr
+@dst = external global i32
+@src = external global i32
 
 define void @test0() nounwind {
 entry:
@@ -12,20 +13,7 @@ entry:
     %tmp.s = load i32, ptr @src
     store i32 %tmp.s, ptr @dst
     ret void
-    
-; CHECK-LABEL:	test0:
-; CHECK-I686:	calll	.L0$pb
-; CHECK-I686-NEXT:	.L0$pb:
-; CHECK-I686-NEXT:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L0$pb),
-; CHECK-I686:	movl	dst@GOT(%eax),
-; CHECK-I686:	movl	ptr@GOT(%eax),
-; CHECK-I686:	movl	src@GOT(%eax),
-; CHECK-I686:	ret
-; CHECK-X32-DAG:	movl	dst@GOTPCREL(%rip),
-; CHECK-X32-DAG:	movl	ptr@GOTPCREL(%rip),
-; CHECK-X32-DAG:	movl	src@GOTPCREL(%rip),
-; CHECK-X32:	retq
+
 }
 
 @ptr2 = global ptr null
@@ -38,20 +26,7 @@ entry:
     %tmp.s = load i32, ptr @src2
     store i32 %tmp.s, ptr @dst2
     ret void
-    
-; CHECK-LABEL:	test1:
-; CHECK-I686:	calll	.L1$pb
-; CHECK-I686-NEXT:	.L1$pb:
-; CHECK-I686-NEXT:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L1$pb), %eax
-; CHECK-I686:	movl	dst2@GOT(%eax),
-; CHECK-I686:	movl	ptr2@GOT(%eax),
-; CHECK-I686:	movl	src2@GOT(%eax),
-; CHECK-I686:	ret
-; CHECK-X32-DAG:	movl	dst2@GOTPCREL(%rip),
-; CHECK-X32-DAG:	movl	ptr2@GOTPCREL(%rip),
-; CHECK-X32-DAG:	movl	src2@GOTPCREL(%rip),
-; CHECK-X32:	retq
+
 
 }
 
@@ -61,27 +36,10 @@ define void @test2() nounwind {
 entry:
     %ptr = call ptr @malloc(i32 40)
     ret void
-; CHECK-LABEL:	test2:
-; CHECK-I686:	pushl	%ebx
-; CHECK-I686-NEXT:	subl	$8, %esp
-; CHECK-I686-NEXT:	calll	.L2$pb
-; CHECK-I686-NEXT:	.L2$pb:
-; CHECK-I686-NEXT:	popl	%ebx
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L2$pb), %ebx
-; CHECK-I686:	movl	$40, (%esp)
-; CHECK-I686:	calll	malloc@PLT
-; CHECK-I686:	addl	$8, %esp
-; CHECK-I686:	popl	%ebx
-; CHECK-I686:	ret
-; CHECK-X32:	pushq	%rax
-; CHECK-X32:	movl	$40, %edi
-; CHECK-X32:	callq	malloc@PLT
-; CHECK-X32:	popq	%rax
-; CHECK-X32:	retq
 
 }
 
-@pfoo = external global ptr 
+@pfoo = external global ptr
 
 define void @test3() nounwind {
 entry:
@@ -90,17 +48,6 @@ entry:
     %tmp1 = load ptr, ptr @pfoo
     call void(...) %tmp1()
     ret void
-; CHECK-LABEL:	test3:
-; CHECK-I686:	calll	.L3$pb
-; CHECK-I686-NEXT:	.L3$pb:
-; CHECK-I686:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L3$pb), %[[REG3:e..]]
-; CHECK-I686:	calll	afoo@PLT
-; CHECK-I686:	movl	pfoo@GOT(%[[REG3]]),
-; CHECK-I686:	calll	*
-; CHECK-X32:	callq	afoo@PLT
-; CHECK-X32:	movl	pfoo@GOTPCREL(%rip),
-; CHECK-X32:	callq	*
 }
 
 declare ptr @afoo(...)
@@ -109,12 +56,6 @@ define void @test4() nounwind {
 entry:
     call void(...) @foo()
     ret void
-; CHECK-LABEL:	test4:
-; CHECK-I686:	calll	.L4$pb
-; CHECK-I686:	popl	%ebx
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L4$pb), %ebx
-; CHECK-I686:	calll	foo@PLT
-; CHECK-X32:	callq	foo@PLT
 
 }
 
@@ -131,22 +72,7 @@ entry:
     %tmp.s = load i32, ptr @src6
     store i32 %tmp.s, ptr @dst6
     ret void
-    
-; CHECK-LABEL:	test5:
-; CHECK-I686:	calll	.L5$pb
-; CHECK-I686-NEXT:	.L5$pb:
-; CHECK-I686-NEXT:	popl	%eax
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L5$pb), %eax
-; CHECK-I686:	leal	dst6@GOTOFF(%eax), %ecx
-; CHECK-I686:	movl	%ecx, ptr6@GOTOFF(%eax)
-; CHECK-I686:	movl	src6@GOTOFF(%eax), %ecx
-; CHECK-I686:	movl	%ecx, dst6@GOTOFF(%eax)
-; CHECK-I686:	ret
-; CHECK-X32:	leal	dst6(%rip), %eax
-; CHECK-X32:	movl	%eax, ptr6(%rip)
-; CHECK-X32:	movl	src6(%rip), %eax
-; CHECK-X32:	movl	%eax, dst6(%rip)
-; CHECK-X32:	retq
+
 }
 
 
@@ -157,14 +83,7 @@ entry:
     %retval = select i1 %tmp, double 4.561230e+02, double 1.234560e+02
     ret double %retval
 
-; CHECK:	.LCPI6_0:
 
-; CHECK-LABEL:	test6:
-; CHECK-I686:	calll .L6$pb
-; CHECK-I686:	.L6$pb:
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L6$pb), 
-; CHECK-I686:	fldl	.LCPI6_0@GOTOFF(
-; CHECK-X32:		.LCPI6_0(%rip),
 }
 
 
@@ -211,39 +130,8 @@ bb11:
 bb12:
     tail call void(...) @foo6()
     ret void
-    
-; CHECK-LABEL:	test7:
-; CHECK-I686:	calll	.L7$pb
-; CHECK-I686:	.L7$pb:
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L7$pb),
-; CHECK-I686:	.LJTI7_0@GOTOFF(
-; CHECK-I686:	jmpl	*
-; CHECK-X32:	leal	.LJTI7_0(%rip), %eax
-; CHECK-X32:	addl	(%eax,%edi,4), %eax
-; CHECK-X32:	jmpq	*%rax
-
-; CHECK:	.p2align 2
-; CHECK-NEXT:	.LJTI7_0:
-; CHECK-I686:	.long	 .LBB7_2@GOTOFF
-; CHECK-I686:	.long	 .LBB7_8@GOTOFF
-; CHECK-I686:	.long	 .LBB7_4@GOTOFF
-; CHECK-I686:	.long	 .LBB7_6@GOTOFF
-; CHECK-I686:	.long	 .LBB7_5@GOTOFF
-; CHECK-I686:	.long	 .LBB7_8@GOTOFF
-; CHECK-I686:	.long	 .LBB7_7@GOTOFF
-; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
+
+
 }
 
 declare void @foo1(...)
@@ -274,26 +162,6 @@ entry:
     store i32 %tmp.s, ptr @tlsdstgd
     ret void
 
-; CHECK-LABEL:	test8:
-; CHECK-I686:	calll	.L8$pb
-; CHECK-I686-NEXT:	.L8$pb:
-; CHECK-I686-NEXT:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L8$pb), %ebx
-; CHECK-I686-DAG:	leal	tlsdstgd@TLSGD(,%ebx), %eax
-; CHECK-I686-DAG:	calll	___tls_get_addr@PLT
-; CHECK-I686-DAG:	leal	tlsptrgd@TLSGD(,%ebx), %eax
-; CHECK-I686-DAG:	calll	___tls_get_addr@PLT
-; CHECK-I686-DAG:	leal	tlssrcgd@TLSGD(,%ebx), %eax
-; CHECK-I686-DAG:	calll	___tls_get_addr@PLT
-; CHECK-X32-NOT:	data16
-; CHECK-X32-DAG:	leaq	tlsdstgd@TLSGD(%rip), %rdi
-; CHECK-X32-DAG:	callq	__tls_get_addr@PLT
-; CHECK-X32-DAG:	leaq	tlsptrgd@TLSGD(%rip), %rdi
-; CHECK-X32-DAG:	callq	__tls_get_addr@PLT
-; CHECK-X32-DAG:	leaq	tlssrcgd@TLSGD(%rip), %rdi
-; CHECK-X32-DAG:	callq	__tls_get_addr@PLT
-; CHECK-I686:	ret
-; CHECK-X32:	retq
 }
 
 define void @test9() nounwind {
@@ -303,21 +171,6 @@ entry:
     store i32 %tmp.s, ptr @tlsdstld
     ret void
 
-; CHECK-LABEL:	test9:
-; CHECK-I686:	calll	.L9$pb
-; CHECK-I686-NEXT:	.L9$pb:
-; CHECK-I686-NEXT:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L9$pb), %ebx
-; CHECK-I686:	leal	tlsdstld@TLSLDM(%ebx), %eax
-; CHECK-X32:	leaq	tlsdstld@TLSLD(%rip), %rdi
-; CHECK-I686:	calll	___tls_get_addr@PLT
-; CHECK-X32:	callq	__tls_get_addr@PLT
-; CHECK:	leal	tlsdstld@DTPOFF(
-; CHECK:	movl	{{%.*}}, tlsptrld@DTPOFF(
-; CHECK:	movl	tlssrcld@DTPOFF(
-; CHECK:	movl	{{%.*}}, tlsdstld@DTPOFF(
-; CHECK-I686:	ret
-; CHECK-X32:	retq
 }
 
 define void @test10() nounwind {
@@ -327,29 +180,6 @@ entry:
     store i32 %tmp.s, ptr @tlsdstie
     ret void
 
-; CHECK-LABEL:	test10:
-; CHECK-I686:	calll	.L10$pb
-; CHECK-I686-NEXT:	.L10$pb:
-; CHECK-I686-NEXT:	popl
-; CHECK-I686:	addl	$_GLOBAL_OFFSET_TABLE_+(.L{{.*}}-.L10$pb),
-; CHECK-I686-DAG:	movl	tlsdstie@GOTNTPOFF(
-; CHECK-I686-DAG:	movl	%gs:0,
-; CHECK-X32-DAG:	movl	tlsdstie@GOTTPOFF(%rip),
-; CHECK-X32-DAG:	movl	%fs:0,
-; CHECK-I686:	addl
-; CHECK-X32:	leal	({{%.*,%.*}}),
-; CHECK-I686:	movl	tlsptrie@GOTNTPOFF(
-; CHECK-X32:	movl	tlsptrie@GOTTPOFF(%rip),
-; CHECK-I686:	movl	{{%.*}}, %gs:(
-; CHECK-X32:	movl	{{%.*}}, ({{%.*,%.*}})
-; CHECK-I686:	movl	tlssrcie@GOTNTPOFF(
-; CHECK-X32:	movl	tlssrcie@GOTTPOFF(%rip),
-; CHECK-I686:	movl	%gs:(
-; CHECK-X32:	movl	({{%.*,%.*}}),
-; CHECK-I686:	movl	{{%.*}}, %gs:(
-; CHECK-X32:	movl	{{%.*}}, ({{%.*,%.*}})
-; CHECK-I686:	ret
-; CHECK-X32:	retq
 }
 
 define void @test11() nounwind {
@@ -359,17 +189,8 @@ entry:
     store i32 %tmp.s, ptr @tlsdstle
     ret void
 
-; CHECK-LABEL:	test11:
-; CHECK-I686:	movl	%gs:0,
-; CHECK-X32:	movl	%fs:0,
-; CHECK-I686:	leal	tlsdstle@NTPOFF(
-; CHECK-X32:	leal	tlsdstle@TPOFF(
-; CHECK-I686:	movl	{{%.*}}, %gs:tlsptrle@NTPOFF
-; CHECK-X32:	movl	{{%.*}}, %fs:tlsptrle@TPOFF
-; CHECK-I686:	movl	%gs:tlssrcle@NTPOFF,
-; CHECK-X32:	movl	%fs:tlssrcle@TPOFF,
-; CHECK-I686:	movl	{{%.*}}, %gs:tlsdstle@NTPOFF
-; CHECK-X32:	movl	{{%.*}}, %fs:tlsdstle@TPOFF
-; CHECK-I686:	ret
-; CHECK-X32:	retq
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; CHECK-I686: {{.*}}
+; CHECK-X32: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pmovsx-inreg.ll b/llvm/test/CodeGen/X86/pmovsx-inreg.ll
index a39ea60331a5e..f6be3fa88698c 100644
--- a/llvm/test/CodeGen/X86/pmovsx-inreg.ll
+++ b/llvm/test/CodeGen/X86/pmovsx-inreg.ll
@@ -27,10 +27,10 @@ define void @test1(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test1:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbq (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxbq (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <2 x i8>, ptr %in, align 1
@@ -74,10 +74,10 @@ define void @test2(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test2:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbq (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxbq (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -108,10 +108,10 @@ define void @test3(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test3:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbd (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxbd (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <4 x i8>, ptr %in, align 1
@@ -155,10 +155,10 @@ define void @test4(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test4:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbd (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxbd (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -189,10 +189,10 @@ define void @test5(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test5:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbw (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxbw (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <8 x i8>, ptr %in, align 1
@@ -236,10 +236,10 @@ define void @test6(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test6:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxbw (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxbw (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -270,10 +270,10 @@ define void @test7(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test7:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxwq (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxwq (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <2 x i16>, ptr %in, align 1
@@ -317,10 +317,10 @@ define void @test8(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test8:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxwq (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxwq (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -351,10 +351,10 @@ define void @test9(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test9:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxwd (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxwd (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <4 x i16>, ptr %in, align 1
@@ -398,10 +398,10 @@ define void @test10(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test10:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxwd (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxwd (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -432,10 +432,10 @@ define void @test11(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test11:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxdq (%ecx), %xmm0
+; X86-AVX2-NEXT:    vpmovsxdq (%eax), %xmm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %xmm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %xmm0, (%eax)
 ; X86-AVX2-NEXT:    retl
   %wide.load35 = load <2 x i32>, ptr %in, align 1
@@ -479,10 +479,10 @@ define void @test12(ptr %in, ptr %out) nounwind {
 ; X86-AVX2-LABEL: test12:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vpmovsxdq (%ecx), %ymm0
+; X86-AVX2-NEXT:    vpmovsxdq (%eax), %ymm0
 ; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vmovups %ymm1, (%eax)
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pointer-vector.ll b/llvm/test/CodeGen/X86/pointer-vector.ll
index aa9b977482fc1..71f147cafeba6 100644
--- a/llvm/test/CodeGen/X86/pointer-vector.ll
+++ b/llvm/test/CodeGen/X86/pointer-vector.ll
@@ -129,8 +129,8 @@ define <4 x i32> @ICMP0(ptr %p0, ptr %p1) nounwind {
 ; CHECK-LABEL: ICMP0:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movdqa (%ecx), %xmm0
+; CHECK-NEXT:    movdqa (%eax), %xmm0
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    pcmpgtd (%eax), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [9,8,7,6]
 ; CHECK-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
@@ -148,8 +148,8 @@ define <4 x i32> @ICMP1(ptr %p0, ptr %p1) nounwind {
 ; CHECK-LABEL: ICMP1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movdqa (%ecx), %xmm0
+; CHECK-NEXT:    movdqa (%eax), %xmm0
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    pcmpeqd (%eax), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [9,8,7,6]
 ; CHECK-NEXT:    blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 3004b8b72fcc5..8334ce815f11a 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -342,84 +342,83 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
-; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
-; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
-; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
-; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    subl $16, %esp
+; X86-NOSSE-NEXT:    movl 32(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 36(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %edx
 ; X86-NOSSE-NEXT:    shrl %edx
 ; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    subl %edx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %edx
 ; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ecx, %edi
-; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    movl %eax, %edx
 ; X86-NOSSE-NEXT:    shrl %edx
 ; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    subl %edx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %edx
 ; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %esi, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %eax
 ; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %esi, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    shrl $4, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    addl %ecx, %eax
+; X86-NOSSE-NEXT:    movl 24(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 28(%ebp), %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %edx, %esi
 ; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %ecx
+; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ecx
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %ecx
 ; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
-; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    movl %ecx, (%eax)
 ; X86-NOSSE-NEXT:    movl $0, 12(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 8(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 4(%eax)
-; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
+; X86-NOSSE-NEXT:    leal -4(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
-; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
@@ -467,23 +466,21 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-POPCNT:       # %bb.0:
 ; X86-POPCNT-NEXT:    pushl %ebp
 ; X86-POPCNT-NEXT:    movl %esp, %ebp
-; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    andl $-16, %esp
 ; X86-POPCNT-NEXT:    subl $16, %esp
-; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
-; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
-; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %ecx
+; X86-POPCNT-NEXT:    addl %eax, %ecx
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %edx
+; X86-POPCNT-NEXT:    addl %eax, %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
-; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
-; X86-POPCNT-NEXT:    addl %ecx, %esi
-; X86-POPCNT-NEXT:    addl %edx, %esi
-; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    movl %edx, (%eax)
 ; X86-POPCNT-NEXT:    movl $0, 12(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 8(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 4(%eax)
-; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
-; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    movl %ebp, %esp
 ; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
@@ -535,7 +532,6 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -555,7 +551,7 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm4
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
 ; X86-SSE2-NEXT:    psadbw %xmm3, %xmm4
-; X86-SSE2-NEXT:    movd %xmm4, %ecx
+; X86-SSE2-NEXT:    movd %xmm4, %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; X86-SSE2-NEXT:    psrlw $1, %xmm5
@@ -571,9 +567,10 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-NEXT:    paddb %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %edx
-; X86-SSE2-NEXT:    addl %ecx, %edx
-; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
+; X86-SSE2-NEXT:    addl %eax, %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
 ; X86-SSE2-NEXT:    movl $0, 12(%eax)
 ; X86-SSE2-NEXT:    movl $0, 8(%eax)
 ; X86-SSE2-NEXT:    movl $0, 4(%eax)
@@ -587,7 +584,6 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-NEXT:    movl %esp, %ebp
 ; X86-SSSE3-NEXT:    andl $-16, %esp
 ; X86-SSSE3-NEXT:    subl $16, %esp
-; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -602,7 +598,7 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-NEXT:    paddb %xmm4, %xmm3
 ; X86-SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm3
-; X86-SSSE3-NEXT:    movd %xmm3, %ecx
+; X86-SSSE3-NEXT:    movd %xmm3, %eax
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSSE3-NEXT:    pand %xmm1, %xmm4
@@ -613,9 +609,10 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-NEXT:    pshufb %xmm3, %xmm0
 ; X86-SSSE3-NEXT:    paddb %xmm5, %xmm0
 ; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
-; X86-SSSE3-NEXT:    movd %xmm0, %edx
-; X86-SSSE3-NEXT:    addl %ecx, %edx
-; X86-SSSE3-NEXT:    movl %edx, (%eax)
+; X86-SSSE3-NEXT:    movd %xmm0, %ecx
+; X86-SSSE3-NEXT:    addl %eax, %ecx
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
+; X86-SSSE3-NEXT:    movl %ecx, (%eax)
 ; X86-SSSE3-NEXT:    movl $0, 12(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 8(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 4(%eax)
@@ -955,85 +952,86 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
 ; X86-NOSSE-NEXT:    subl $16, %esp
-; X86-NOSSE-NEXT:    movl 32(%ebp), %edx
-; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    movl 8(%ebp), %ecx
+; X86-NOSSE-NEXT:    xorl %eax, %eax
+; X86-NOSSE-NEXT:    movl %eax, 12(%ecx)
+; X86-NOSSE-NEXT:    movl %eax, 8(%ecx)
+; X86-NOSSE-NEXT:    movl %eax, 4(%ecx)
+; X86-NOSSE-NEXT:    movl 36(%ebp), %eax
+; X86-NOSSE-NEXT:    movl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl %ecx
+; X86-NOSSE-NEXT:    movl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %edx, %ecx
+; X86-NOSSE-NEXT:    subl %ecx, %eax
+; X86-NOSSE-NEXT:    movl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    andl %edx, %esi
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl %edx, %eax
+; X86-NOSSE-NEXT:    addl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    movl 32(%ebp), %edi
+; X86-NOSSE-NEXT:    movl %edi, %eax
 ; X86-NOSSE-NEXT:    shrl %eax
 ; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
 ; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    subl %eax, %esi
-; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %esi, %edi
-; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    addl %edi, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %esi, %edi
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    movl $1431655765, %eax # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %eax, %esi
-; X86-NOSSE-NEXT:    subl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %edx, %ebx
-; X86-NOSSE-NEXT:    movl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    subl %eax, %edi
+; X86-NOSSE-NEXT:    movl %edi, %eax
+; X86-NOSSE-NEXT:    andl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %edi
 ; X86-NOSSE-NEXT:    andl %edx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    andl %edx, %ebx
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl 28(%ebp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl %eax, %edi
-; X86-NOSSE-NEXT:    subl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %eax, %edi
+; X86-NOSSE-NEXT:    movl %edi, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %edi, %eax
+; X86-NOSSE-NEXT:    movl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %edi, %esi
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %ebx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ebx
+; X86-NOSSE-NEXT:    andl %edi, %eax
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %esi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %esi
+; X86-NOSSE-NEXT:    addl %ebx, %esi
+; X86-NOSSE-NEXT:    movl 28(%ebp), %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %ebx
+; X86-NOSSE-NEXT:    andl %ecx, %ebx
 ; X86-NOSSE-NEXT:    subl %ebx, %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
-; X86-NOSSE-NEXT:    andl %ecx, %ebx
+; X86-NOSSE-NEXT:    andl %edx, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    andl %edx, %eax
 ; X86-NOSSE-NEXT:    addl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %ecx
-; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    movl %eax, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %eax, %ebx
+; X86-NOSSE-NEXT:    movl 24(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %edi, %eax
+; X86-NOSSE-NEXT:    subl %eax, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %eax
+; X86-NOSSE-NEXT:    andl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %ecx
+; X86-NOSSE-NEXT:    andl %edx, %ecx
 ; X86-NOSSE-NEXT:    addl %eax, %ecx
-; X86-NOSSE-NEXT:    movl $252645135, %eax # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %eax, %edi
-; X86-NOSSE-NEXT:    andl %eax, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    movl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %ecx, %eax
+; X86-NOSSE-NEXT:    movl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %ecx, %ebx
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
 ; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
-; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    leal -12(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
@@ -1089,6 +1087,10 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-POPCNT-NEXT:    andl $-16, %esp
 ; X86-POPCNT-NEXT:    subl $16, %esp
 ; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    xorl %ecx, %ecx
+; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
+; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
+; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
 ; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
@@ -1096,10 +1098,6 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
-; X86-POPCNT-NEXT:    xorl %ecx, %ecx
-; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
-; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
-; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
 ; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
@@ -1155,6 +1153,10 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    xorl %ecx, %ecx
+; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
+; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1192,10 +1194,6 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    addl %ecx, %edx
-; X86-SSE2-NEXT:    xorl %ecx, %ecx
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
 ; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
@@ -1208,6 +1206,10 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSSE3-NEXT:    andl $-16, %esp
 ; X86-SSSE3-NEXT:    subl $16, %esp
 ; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
+; X86-SSSE3-NEXT:    xorl %ecx, %ecx
+; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)
+; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
+; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1235,10 +1237,6 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
 ; X86-SSSE3-NEXT:    movd %xmm0, %edx
 ; X86-SSSE3-NEXT:    addl %ecx, %edx
-; X86-SSSE3-NEXT:    xorl %ecx, %ecx
-; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)
-; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
-; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
 ; X86-SSSE3-NEXT:    movl %ebp, %esp
 ; X86-SSSE3-NEXT:    popl %ebp
@@ -1464,21 +1462,28 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    andl $-16, %esp
-; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
 ; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
-; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl %edx
-; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    movl 36(%ebp), %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
 ; X86-NOSSE-NEXT:    movl %ecx, %esi
 ; X86-NOSSE-NEXT:    shrl %esi
 ; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
@@ -1488,54 +1493,47 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ecx, %edi
-; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
-; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl %edx
-; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl 24(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %edx
-; X86-NOSSE-NEXT:    shrl $4, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %esi, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %esi, %eax
-; X86-NOSSE-NEXT:    movl %eax, %esi
-; X86-NOSSE-NEXT:    shrl $4, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    addl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %esi, %edi
+; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %esi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %esi
+; X86-NOSSE-NEXT:    movl %edx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %edi
+; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %edx, %edi
+; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %esi, %edx
 ; X86-NOSSE-NEXT:    addl %ecx, %edx
-; X86-NOSSE-NEXT:    xorl %ecx, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
 ; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
@@ -1591,6 +1589,10 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-POPCNT-NEXT:    andl $-16, %esp
 ; X86-POPCNT-NEXT:    subl $16, %esp
 ; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    xorl %ecx, %ecx
+; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
+; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
+; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
 ; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
@@ -1598,10 +1600,6 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
-; X86-POPCNT-NEXT:    xorl %ecx, %ecx
-; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
-; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
-; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
 ; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
@@ -1657,6 +1655,10 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    xorl %ecx, %ecx
+; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
+; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1694,10 +1696,6 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSE2-NEXT:    psadbw %xmm3, %xmm0
 ; X86-SSE2-NEXT:    movd %xmm0, %edx
 ; X86-SSE2-NEXT:    addl %ecx, %edx
-; X86-SSE2-NEXT:    xorl %ecx, %ecx
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
 ; X86-SSE2-NEXT:    movl %ebp, %esp
 ; X86-SSE2-NEXT:    popl %ebp
@@ -1710,6 +1708,10 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSSE3-NEXT:    andl $-16, %esp
 ; X86-SSSE3-NEXT:    subl $16, %esp
 ; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
+; X86-SSSE3-NEXT:    xorl %ecx, %ecx
+; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)
+; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
+; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1737,10 +1739,6 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSSE3-NEXT:    psadbw %xmm2, %xmm0
 ; X86-SSSE3-NEXT:    movd %xmm0, %edx
 ; X86-SSSE3-NEXT:    addl %ecx, %edx
-; X86-SSSE3-NEXT:    xorl %ecx, %ecx
-; X86-SSSE3-NEXT:    movl %ecx, 12(%eax)
-; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
-; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
 ; X86-SSSE3-NEXT:    movl %ebp, %esp
 ; X86-SSSE3-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/powi-const.ll b/llvm/test/CodeGen/X86/powi-const.ll
index 49f0e9a6b1455..f6980d9d5c06b 100644
--- a/llvm/test/CodeGen/X86/powi-const.ll
+++ b/llvm/test/CodeGen/X86/powi-const.ll
@@ -144,23 +144,21 @@ define <2 x float> @powi_v2f32(<2 x float> %a) nounwind minsize {
 ; X86-X87-LABEL: powi_v2f32:
 ; X86-X87:       # %bb.0:
 ; X86-X87-NEXT:    pushl %esi
-; X86-X87-NEXT:    subl $16, %esp
-; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-X87-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    subl $12, %esp
 ; X86-X87-NEXT:    pushl $15
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fstps (%esp)
 ; X86-X87-NEXT:    calll __powisf2
 ; X86-X87-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-X87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-X87-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT:    fstps (%esp)
 ; X86-X87-NEXT:    calll __powisf2
 ; X86-X87-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-X87-NEXT:    fxch %st(1)
-; X86-X87-NEXT:    addl $16, %esp
+; X86-X87-NEXT:    addl $12, %esp
 ; X86-X87-NEXT:    popl %esi
 ; X86-X87-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll
index 9446cc071c1fc..16b5093315516 100644
--- a/llvm/test/CodeGen/X86/powi.ll
+++ b/llvm/test/CodeGen/X86/powi.ll
@@ -21,9 +21,9 @@ define float @test_powi_f32_i32(float %Val, i32 %x) nounwind {
 ; SDAG-X86-LABEL: test_powi_f32_i32:
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $12, %esp
-; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    flds {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstps (%esp)
 ; SDAG-X86-NEXT:    calll __powisf2
 ; SDAG-X86-NEXT:    addl $12, %esp
@@ -76,9 +76,9 @@ define double @test_powi_f64_i32(double %Val, i32 %x) nounwind {
 ; SDAG-X86-LABEL: test_powi_f64_i32:
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $12, %esp
-; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpl (%esp)
 ; SDAG-X86-NEXT:    calll __powidf2
 ; SDAG-X86-NEXT:    addl $12, %esp
@@ -134,9 +134,9 @@ define x86_fp80 @test_powi_f80_i32(x86_fp80 %Val, i32 %x) nounwind {
 ; SDAG-X86-LABEL: test_powi_f80_i32:
 ; SDAG-X86:       # %bb.0:
 ; SDAG-X86-NEXT:    subl $28, %esp
-; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SDAG-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; SDAG-X86-NEXT:    fstpt (%esp)
 ; SDAG-X86-NEXT:    calll __powixf2
 ; SDAG-X86-NEXT:    addl $28, %esp
diff --git a/llvm/test/CodeGen/X86/pr111170.ll b/llvm/test/CodeGen/X86/pr111170.ll
index 145bf7119edcb..64e5906d2375d 100644
--- a/llvm/test/CodeGen/X86/pr111170.ll
+++ b/llvm/test/CodeGen/X86/pr111170.ll
@@ -4,18 +4,18 @@
 define void @PR111170(<16 x i32> %x_load, ptr %offsetsPtr.i) {
 ; CHECK-LABEL: PR111170:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2.80259693E-44,2.80259693E-44,2.80259693E-44,2.80259693E-44]
+; CHECK-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2.80259693E-44,2.80259693E-44,2.80259693E-44,2.80259693E-44]
-; CHECK-NEXT:    vpmulld %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu %xmm0, 16(%eax)
-; CHECK-NEXT:    vmovdqu %xmm4, (%eax)
-; CHECK-NEXT:    vmovdqu %xmm1, 48(%eax)
-; CHECK-NEXT:    vmovdqu %xmm3, 32(%eax)
+; CHECK-NEXT:    vmovdqu %xmm2, 16(%eax)
+; CHECK-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; CHECK-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, 48(%eax)
+; CHECK-NEXT:    vpmulld %xmm3, %xmm1, %xmm0
+; CHECK-NEXT:    vmovdqu %xmm0, 32(%eax)
 ; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqu %xmm0, 16
 ; CHECK-NEXT:    vmovdqu %xmm0, 0
diff --git a/llvm/test/CodeGen/X86/pr118934.ll b/llvm/test/CodeGen/X86/pr118934.ll
index b68ed2643d8df..ea38fc8b5c850 100644
--- a/llvm/test/CodeGen/X86/pr118934.ll
+++ b/llvm/test/CodeGen/X86/pr118934.ll
@@ -5,18 +5,18 @@
 define void @PR118934(i1 %b, ptr %f, ptr %k) {
 ; X86-LABEL: PR118934:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    addb $-2, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andb $1, %dl
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    addb $-2, %dl
-; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    addl $-6, %edx
-; X86-NEXT:    addl $6, %edx
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    addl %edx, %edx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    addl $-6, %ecx
+; X86-NEXT:    addl $6, %ecx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    addl %ecx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ;
 ; X64-LABEL: PR118934:
 ; X64:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/pr134607.ll b/llvm/test/CodeGen/X86/pr134607.ll
index 2b383fc964298..56d721a716da6 100644
--- a/llvm/test/CodeGen/X86/pr134607.ll
+++ b/llvm/test/CodeGen/X86/pr134607.ll
@@ -6,8 +6,8 @@
 define void @store_v2f32_constant(ptr %v) {
 ; X86-LABEL: store_v2f32_constant:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps {{.*#+}} xmm0 = [2.56E+2,5.12E+2,u,u]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/pr14314.ll b/llvm/test/CodeGen/X86/pr14314.ll
index bb1608e3cd9b2..07af9fd7fe01c 100644
--- a/llvm/test/CodeGen/X86/pr14314.ll
+++ b/llvm/test/CodeGen/X86/pr14314.ll
@@ -9,18 +9,18 @@ define i64 @atomicSub(ptr %a, i64 %b) nounwind {
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl (%esi), %eax
+; CHECK-NEXT:    movl 4(%esi), %edx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT:    movl (%ebp), %eax
-; CHECK-NEXT:    movl 4(%ebp), %edx
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %eax, %ebx
-; CHECK-NEXT:    subl %edi, %ebx
+; CHECK-NEXT:    subl %ebp, %ebx
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    sbbl %esi, %ecx
-; CHECK-NEXT:    lock cmpxchg8b (%ebp)
+; CHECK-NEXT:    sbbl %edi, %ecx
+; CHECK-NEXT:    lock cmpxchg8b (%esi)
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
 ; CHECK-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/pr147781.ll b/llvm/test/CodeGen/X86/pr147781.ll
index 9f65fd5984e7b..7757df2307514 100644
--- a/llvm/test/CodeGen/X86/pr147781.ll
+++ b/llvm/test/CodeGen/X86/pr147781.ll
@@ -5,10 +5,10 @@
 define void @test(ptr %p, ptr %p2) #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd %xmm0, (%eax)
+; CHECK-NEXT:    movsd %xmm0, (%ecx)
 ; CHECK-NEXT:    retl
   %val = load i64, ptr %p, align 8, !range !0
   store i64 %val, ptr %p2, align 8
diff --git a/llvm/test/CodeGen/X86/pr15309.ll b/llvm/test/CodeGen/X86/pr15309.ll
index da8f68ed637ae..5048360668882 100644
--- a/llvm/test/CodeGen/X86/pr15309.ll
+++ b/llvm/test/CodeGen/X86/pr15309.ll
@@ -4,30 +4,26 @@
 define void @test_convert_float2_ulong2(ptr nocapture %src, ptr nocapture %dest) nounwind {
 ; CHECK-LABEL: test_convert_float2_ulong2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pushl %edi
-; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $20, %esp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl 168(%ecx), %edx
-; CHECK-NEXT:    movl 172(%ecx), %esi
-; CHECK-NEXT:    movl 160(%ecx), %edi
-; CHECK-NEXT:    movl 164(%ecx), %ecx
+; CHECK-NEXT:    movl 172(%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %edi, (%esp)
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl 168(%eax), %edx
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    shrl $31, %ecx
-; CHECK-NEXT:    fildll (%esp)
-; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%ecx,4)
-; CHECK-NEXT:    shrl $31, %esi
 ; CHECK-NEXT:    fildll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%esi,4)
-; CHECK-NEXT:    fstps 84(%eax)
-; CHECK-NEXT:    fstps 80(%eax)
+; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%ecx,4)
+; CHECK-NEXT:    movl 160(%eax), %ecx
+; CHECK-NEXT:    movl 164(%eax), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    fstps 84(%edx)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %ecx, (%esp)
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    fildll (%esp)
+; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; CHECK-NEXT:    fstps 80(%edx)
 ; CHECK-NEXT:    addl $20, %esp
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
   %t0 = getelementptr <2 x i64>, ptr %src, i32 10
   %t1 = load <2 x i64>, ptr %t0, align 16
diff --git a/llvm/test/CodeGen/X86/pr179489.ll b/llvm/test/CodeGen/X86/pr179489.ll
index b7308b77669e7..0d8229fe7745d 100644
--- a/llvm/test/CodeGen/X86/pr179489.ll
+++ b/llvm/test/CodeGen/X86/pr179489.ll
@@ -8,9 +8,9 @@ declare void @llvm.masked.store.v8i8.p1(<8 x i8>, ptr addrspace(1) captures(none
 define void @foo(<8 x i16> %arg, ptr addrspace(1) %add.ptr) {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vmovw %xmm0, %ecx
-; X86-NEXT:    movb %ch, (%eax)
+; X86-NEXT:    vmovw %xmm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb %ah, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
@@ -29,12 +29,12 @@ entry:
 define void @bar(<8 x i64> %arg, ptr addrspace(1) %add.ptr) {
 ; X86-LABEL: bar:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [257,0,257,0,257,0,257,0,257,0,257,0,257,0,257,0]
 ; X86-NEXT:    movb $2, %al
 ; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [257,0,257,0,257,0,257,0,257,0,257,0,257,0,257,0]
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsrlq $16, %zmm1, %zmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpmovqb %zmm0, (%eax) {%k1}
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr18344.ll b/llvm/test/CodeGen/X86/pr18344.ll
index 75a55e6a4bf5e..0560e9772cdce 100644
--- a/llvm/test/CodeGen/X86/pr18344.ll
+++ b/llvm/test/CodeGen/X86/pr18344.ll
@@ -7,29 +7,23 @@
 define void @FFT(ptr noalias nocapture %destination, ptr noalias %re, ptr noalias nocapture %ptr_cast_for_load) nounwind {
 ; X86-LABEL: FFT:
 ; X86:       # %bb.0: # %begin
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqu (%edx), %xmm0
+; X86-NEXT:    movdqu (%eax), %xmm0
 ; X86-NEXT:    pslld $4, %xmm0
-; X86-NEXT:    movd %xmm0, %edx
-; X86-NEXT:    pextrd $1, %xmm0, %esi
-; X86-NEXT:    pextrd $2, %xmm0, %edi
-; X86-NEXT:    pextrd $3, %xmm0, %ebx
-; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    pextrd $1, %xmm0, %eax
 ; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    pextrd $2, %xmm0, %eax
 ; X86-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X86-NEXT:    movss %xmm0, 128(%eax)
-; X86-NEXT:    movss %xmm1, 164(%eax)
-; X86-NEXT:    movss %xmm2, 200(%eax)
-; X86-NEXT:    movss %xmm3, 236(%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    pextrd $3, %xmm0, %eax
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movss %xmm0, 236(%eax)
+; X86-NEXT:    movss %xmm3, 200(%eax)
+; X86-NEXT:    movss %xmm2, 164(%eax)
+; X86-NEXT:    movss %xmm1, 128(%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: FFT:
diff --git a/llvm/test/CodeGen/X86/pr20011.ll b/llvm/test/CodeGen/X86/pr20011.ll
index 4810226b4a756..bf6f2b7d8fca4 100644
--- a/llvm/test/CodeGen/X86/pr20011.ll
+++ b/llvm/test/CodeGen/X86/pr20011.ll
@@ -7,13 +7,13 @@
 define void @crash(i64 %x0, i64 %y0, ptr nocapture %dest) nounwind {
 ; X86-LABEL: crash:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shlb $2, %al
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shlb $2, %dl
 ; X86-NEXT:    andb $3, %cl
-; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    orb %al, %cl
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/pr22338.ll b/llvm/test/CodeGen/X86/pr22338.ll
index bf3967ca8fede..61d9dbaab715f 100644
--- a/llvm/test/CodeGen/X86/pr22338.ll
+++ b/llvm/test/CodeGen/X86/pr22338.ll
@@ -8,19 +8,19 @@ define i32 @fn(i32 %a0, i32 %a1) {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    setne %al
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sete %dl
-; X86-NEXT:    negl %eax
-; X86-NEXT:    addb %cl, %cl
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    setne %bl
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    addb %dl, %dl
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_1: # %bb1
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/pr22970.ll b/llvm/test/CodeGen/X86/pr22970.ll
index 28aea0f2af58c..4e9efd87b47e1 100644
--- a/llvm/test/CodeGen/X86/pr22970.ll
+++ b/llvm/test/CodeGen/X86/pr22970.ll
@@ -5,10 +5,10 @@
 define i32 @PR22970_i32(ptr nocapture readonly, i32) {
 ; X86-LABEL: PR22970_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $4095, %ecx # imm = 0xFFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 32(%eax,%ecx,4), %eax
+; X86-NEXT:    movl $4095, %eax # imm = 0xFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 32(%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR22970_i32:
@@ -28,10 +28,10 @@ define i32 @PR22970_i32(ptr nocapture readonly, i32) {
 define i32 @PR22970_i64(ptr nocapture readonly, i64) {
 ; X86-LABEL: PR22970_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $4095, %ecx # imm = 0xFFF
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 32(%eax,%ecx,4), %eax
+; X86-NEXT:    movl $4095, %eax # imm = 0xFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 32(%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR22970_i64:
diff --git a/llvm/test/CodeGen/X86/pr25725.ll b/llvm/test/CodeGen/X86/pr25725.ll
index 1b164139cc460..b16f021925de1 100644
--- a/llvm/test/CodeGen/X86/pr25725.ll
+++ b/llvm/test/CodeGen/X86/pr25725.ll
@@ -17,8 +17,8 @@ define void @f(ptr nocapture %s) #0 {
 ;
 ; X86-LABEL: f:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movups %xmm0, 48(%eax)
 ; X86-NEXT:    movups %xmm0, 32(%eax)
 ; X86-NEXT:    movups %xmm0, 16(%eax)
diff --git a/llvm/test/CodeGen/X86/pr2656.ll b/llvm/test/CodeGen/X86/pr2656.ll
index d3f153be19293..6c35e15f4413f 100644
--- a/llvm/test/CodeGen/X86/pr2656.ll
+++ b/llvm/test/CodeGen/X86/pr2656.ll
@@ -20,13 +20,13 @@ define void @foo(ptr byval(%struct.anon) %p) nounwind {
 ; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    xorps %xmm0, %xmm1
-; CHECK-NEXT:    cvtss2sd %xmm1, %xmm1
 ; CHECK-NEXT:    xorps %xmm0, %xmm2
+; CHECK-NEXT:    cvtss2sd %xmm2, %xmm2
+; CHECK-NEXT:    movsd %xmm2, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    cvtss2sd %xmm2, %xmm0
+; CHECK-NEXT:    cvtss2sd %xmm1, %xmm0
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movsd %xmm1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $_.str, (%esp)
 ; CHECK-NEXT:    calll _printf
 ; CHECK-NEXT:    addl $28, %esp
diff --git a/llvm/test/CodeGen/X86/pr28824.ll b/llvm/test/CodeGen/X86/pr28824.ll
index 274689527a243..0c59d12ebcbf0 100644
--- a/llvm/test/CodeGen/X86/pr28824.ll
+++ b/llvm/test/CodeGen/X86/pr28824.ll
@@ -11,11 +11,11 @@ define i32 @fn4(i32 %i) #0 {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    subl $8, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movzbl d, %ecx
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    sarl %cl, %esi
-; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    movl $2, %ecx
 ; CHECK-NEXT:    movl $5, %edx
 ; CHECK-NEXT:    pushl %eax
diff --git a/llvm/test/CodeGen/X86/pr29170.ll b/llvm/test/CodeGen/X86/pr29170.ll
index a27238a8e4f18..c396a19c0a896 100644
--- a/llvm/test/CodeGen/X86/pr29170.ll
+++ b/llvm/test/CodeGen/X86/pr29170.ll
@@ -14,8 +14,8 @@ define i32 @main() {
 ; CHECK-NEXT:    jne .LBB0_3
 ; CHECK-NEXT:  # %bb.1: # %go
 ; CHECK-NEXT:    movl $-1, %ecx
-; CHECK-NEXT:    movsbl b, %edx
 ; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    movsbl b, %edx
 ; CHECK-NEXT:    movzwl %dx, %edx
 ; CHECK-NEXT:    cmpl $-1, %edx
 ; CHECK-NEXT:    sbbl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/pr30284.ll b/llvm/test/CodeGen/X86/pr30284.ll
index 708f0f7ee72da..33855adbe9135 100644
--- a/llvm/test/CodeGen/X86/pr30284.ll
+++ b/llvm/test/CodeGen/X86/pr30284.ll
@@ -16,15 +16,15 @@ define void @undef_cond() {
 define void @f_f___un_3C_unf_3E_un_3C_unf_3E_(<16 x i1> %x) {
 ; CHECK-LABEL: f_f___un_3C_unf_3E_un_3C_unf_3E_:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
+; CHECK-NEXT:    vporq 0, %zmm1, %zmm2
 ; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; CHECK-NEXT:    vpslld $31, %zmm0, %zmm0
 ; CHECK-NEXT:    vpmovd2m %zmm0, %k1
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
-; CHECK-NEXT:    vporq 64, %zmm0, %zmm1
-; CHECK-NEXT:    vporq 0, %zmm0, %zmm0
-; CHECK-NEXT:    kshiftrw $8, %k1, %k2
-; CHECK-NEXT:    vmovdqa64 %zmm0, 0 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, 64 {%k2}
+; CHECK-NEXT:    vmovdqa64 %zmm2, 0 {%k1}
+; CHECK-NEXT:    vporq 64, %zmm1, %zmm0
+; CHECK-NEXT:    kshiftrw $8, %k1, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, 64 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
   %a_load22 = load <16 x i64>, ptr null, align 1
diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll
index 5ecb67ba7ffa6..ea2b36aefb5b2 100644
--- a/llvm/test/CodeGen/X86/pr31088.ll
+++ b/llvm/test/CodeGen/X86/pr31088.ll
@@ -61,14 +61,12 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
 define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X86-LABEL: ir_fadd_v2f16:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $84, %esp
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    psrld $16, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    movdqa %xmm1, %xmm0
 ; X86-NEXT:    psrld $16, %xmm0
-; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT:    pextrw $0, %xmm1, %eax
+; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
@@ -82,6 +80,7 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT:    psrld $16, %xmm0
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -101,8 +100,9 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    calll __truncsfhf2
 ; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT:    addl $84, %esp
+; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-NEXT:    movdqa %xmm1, %xmm0
+; X86-NEXT:    addl $64, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ir_fadd_v2f16:
diff --git a/llvm/test/CodeGen/X86/pr32282.ll b/llvm/test/CodeGen/X86/pr32282.ll
index a5bb7316a9673..f466ca80d2a18 100644
--- a/llvm/test/CodeGen/X86/pr32282.ll
+++ b/llvm/test/CodeGen/X86/pr32282.ll
@@ -12,25 +12,29 @@
 define dso_local void @foo(i64 %x) nounwind {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl d, %eax
-; X86-NEXT:    notl %eax
-; X86-NEXT:    movl d+4, %ecx
-; X86-NEXT:    notl %ecx
-; X86-NEXT:    andl $701685459, %ecx # imm = 0x29D2DED3
-; X86-NEXT:    andl $-566231040, %eax # imm = 0xDE400000
-; X86-NEXT:    shrdl $21, %ecx, %eax
-; X86-NEXT:    shrl $21, %ecx
-; X86-NEXT:    addl $7, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl d+4, %edx
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl $701685459, %edx # imm = 0x29D2DED3
+; X86-NEXT:    movl d, %esi
+; X86-NEXT:    notl %esi
+; X86-NEXT:    andl $-566231040, %esi # imm = 0xDE400000
+; X86-NEXT:    shrdl $21, %edx, %esi
+; X86-NEXT:    shrl $21, %edx
+; X86-NEXT:    addl $7, %esi
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne {{[0-9]+}}(%esp)
-; X86-NEXT:    popl %eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll
index f9b1331551283..bbd79062c4262 100644
--- a/llvm/test/CodeGen/X86/pr32284.ll
+++ b/llvm/test/CodeGen/X86/pr32284.ll
@@ -307,32 +307,33 @@ define void @f1() {
 ; X86-NEXT:    .cfi_def_cfa_offset 13
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    movl var_5, %eax
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl var_5, %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    xorl $208307499, %edx # imm = 0xC6A852B
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    xorl $-2, %esi
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    setne (%esp)
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    andl %ecx, %esi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    sete %dl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl %eax, %edx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    cmpl $-1, %edx
+; X86-NEXT:    sete %bl
+; X86-NEXT:    movl %ebx, _ZN8struct_210member_2_0E
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    addl $7093, %edx # imm = 0x1BB5
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    cmpl $-1, %ecx
 ; X86-NEXT:    sete %bl
-; X86-NEXT:    addl $7093, %eax # imm = 0x1BB5
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    sbbl $0, %ecx
+; X86-NEXT:    cmpl %ebx, %edx
+; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movl %eax, var_57
 ; X86-NEXT:    movl $0, var_57+4
-; X86-NEXT:    movl %edx, _ZN8struct_210member_2_0E
 ; X86-NEXT:    movl $0, _ZN8struct_210member_2_0E+4
 ; X86-NEXT:    addl $1, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 12
@@ -768,21 +769,25 @@ define void @f3() #0 {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl var_13, %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    movl var_13, %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    notl %eax
+; X86-NEXT:    movl %eax, var_46
+; X86-NEXT:    movl var_16, %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl var_16, %edx
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    andl %esi, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, var_46
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index d9671aa04f460..240333d237d67 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -17,52 +17,41 @@
 define void @foo() local_unnamed_addr {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %ebx
-; X86-NEXT:    movl var_310, %ecx
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    movl $4194303, %edx # imm = 0x3FFFFF
+; X86-NEXT:    andl obj, %edx
+; X86-NEXT:    leal (%edx,%edx), %eax
+; X86-NEXT:    movsbl var_27, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movzwl var_2, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    subl %esi, %edi
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    setge var_205
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movb %al, var_218
+; X86-NEXT:    imull var_310, %ecx
 ; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
-; X86-NEXT:    andl obj, %esi
-; X86-NEXT:    leal (%esi,%esi), %edx
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    subl %ebx, %edi
 ; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addb $113, %cl
-; X86-NEXT:    movl $9, %ebx
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl $9, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    shldl %cl, %eax, %esi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %ebx, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %ebx, var_50
-; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movb %dl, var_218
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    movl %esi, var_50+4
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    movl %eax, var_50
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll
index c7405e982660c..9f57ba7115d84 100644
--- a/llvm/test/CodeGen/X86/pr32345.ll
+++ b/llvm/test/CodeGen/X86/pr32345.ll
@@ -89,10 +89,9 @@ define void @foo() {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movzbl var_27, %ecx
 ; X86-NEXT:    movzwl var_22, %eax
 ; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl var_27, %ecx
 ; X86-NEXT:    addb $30, %cl
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    shrdl %cl, %edx, %eax
@@ -101,6 +100,7 @@ define void @foo() {
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:  .LBB0_2: # %bb
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movb %dl, (%eax)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/pr32610.ll b/llvm/test/CodeGen/X86/pr32610.ll
index 6f3602dcf8d88..764ff9f81134f 100644
--- a/llvm/test/CodeGen/X86/pr32610.ll
+++ b/llvm/test/CodeGen/X86/pr32610.ll
@@ -13,25 +13,25 @@ define void @pr32610(i32 %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    movl 8(%ebp), %edx
 ; CHECK-NEXT:    movl L_b$non_lazy_ptr, %eax
-; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    cmpl %eax, %edx
-; CHECK-NEXT:    sete %cl
+; CHECK-NEXT:    movl (%eax), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %edx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl %ecx, %edx
+; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    incl %esi
 ; CHECK-NEXT:    cmpl $0, 12(%ebp)
-; CHECK-NEXT:    cmovel %esi, %ecx
-; CHECK-NEXT:    cmpl %eax, %edx
-; CHECK-NEXT:    cmovnel %esi, %ecx
-; CHECK-NEXT:    movl L_c$non_lazy_ptr, %edx
-; CHECK-NEXT:    movl %ecx, (%edx)
-; CHECK-NEXT:    testl %eax, %eax
-; CHECK-NEXT:    movl $2, %ecx
-; CHECK-NEXT:    cmovnel %eax, %ecx
-; CHECK-NEXT:    movl L_d$non_lazy_ptr, %eax
-; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    cmpl %ecx, %edx
+; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    movl $2, %edx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovnel %ecx, %edx
+; CHECK-NEXT:    movl L_d$non_lazy_ptr, %ecx
+; CHECK-NEXT:    movl %edx, (%ecx)
+; CHECK-NEXT:    movl L_c$non_lazy_ptr, %ecx
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr32659.ll b/llvm/test/CodeGen/X86/pr32659.ll
index c2ce0cd1be86e..9f092980e6c04 100644
--- a/llvm/test/CodeGen/X86/pr32659.ll
+++ b/llvm/test/CodeGen/X86/pr32659.ll
@@ -29,12 +29,12 @@ define void @fn2() nounwind optsize {
 ; CHECK-NEXT:    movl $48, (%esp)
 ; CHECK-NEXT:    calll putchar@PLT
 ; CHECK-NEXT:    movl h, %eax
-; CHECK-NEXT:    movl c, %ecx
-; CHECK-NEXT:    movl j, %edx
-; CHECK-NEXT:    movl (%edx), %edx
-; CHECK-NEXT:    movl (%edx), %edx
+; CHECK-NEXT:    movl j, %ecx
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    movl c, %edx
 ; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    cmpl (%edx), %ecx
+; CHECK-NEXT:    cmpl (%ecx), %edx
 ; CHECK-NEXT:    setg %bl
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpl %ebx, i
@@ -49,9 +49,9 @@ define void @fn2() nounwind optsize {
 ; CHECK-NEXT:    andl %eax, e
 ; CHECK-NEXT:    sarl $31, %eax
 ; CHECK-NEXT:    andl %eax, e+4
-; CHECK-NEXT:    decl g
 ; CHECK-NEXT:    addl $1, f
 ; CHECK-NEXT:    adcl $0, f+4
+; CHECK-NEXT:    decl g
 ; CHECK-NEXT:    addl $8, %esp
 ; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index 279373a7aab3f..ddfae0ac48d48 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -14,41 +14,40 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    andl $-8, %esp
 ; CHECK-NEXT:    subl $40, %esp
 ; CHECK-NEXT:    movl 8(%ebp), %ebx
-; CHECK-NEXT:    movl 8(%ebx), %esi
+; CHECK-NEXT:    movl 8(%ebx), %ecx
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpl $3, 12(%ebx)
 ; CHECK-NEXT:    setl %al
-; CHECK-NEXT:    subl %eax, %esi
-; CHECK-NEXT:    movl $-1374389535, %ecx # imm = 0xAE147AE1
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    imull %ecx
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    sarl $5, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
 ; CHECK-NEXT:    movl $1374389535, %edx # imm = 0x51EB851F
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    imull %edx
 ; CHECK-NEXT:    movl %edx, %edi
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    sarl $7, %edi
 ; CHECK-NEXT:    addl %eax, %edi
-; CHECK-NEXT:    imull $36525, %esi, %eax # imm = 0x8EAD
-; CHECK-NEXT:    addl $172251900, %eax # imm = 0xA445AFC
-; CHECK-NEXT:    movl $1374389535, %edx # imm = 0x51EB851F
+; CHECK-NEXT:    movl $-1374389535, %edx # imm = 0xAE147AE1
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    imull %edx
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    sarl $5, %esi
+; CHECK-NEXT:    addl %eax, %esi
+; CHECK-NEXT:    addl 16(%ebx), %esi
+; CHECK-NEXT:    addl %edi, %esi
+; CHECK-NEXT:    imull $36525, %ecx, %eax # imm = 0x8EAD
+; CHECK-NEXT:    addl $172251900, %eax # imm = 0xA445AFC
+; CHECK-NEXT:    movl $1374389535, %ecx # imm = 0x51EB851F
+; CHECK-NEXT:    imull %ecx
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    sarl $5, %edx
 ; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    addl 16(%ebx), %ecx
-; CHECK-NEXT:    addl %edi, %ecx
-; CHECK-NEXT:    leal 257(%ecx,%edx), %eax
+; CHECK-NEXT:    leal 257(%esi,%edx), %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fildl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}
+; CHECK-NEXT:    fldl 28(%ebx)
 ; CHECK-NEXT:    fmuls {{\.?LCPI[0-9]+_[0-9]+}}
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -57,27 +56,28 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movb $1, 36(%ebx)
-; CHECK-NEXT:    imull $3600000, 20(%ebx), %ecx # imm = 0x36EE80
-; CHECK-NEXT:    imull $60000, 24(%ebx), %eax # imm = 0xEA60
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    fldl 28(%ebx)
+; CHECK-NEXT:    fildl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}
 ; CHECK-NEXT:    fmuls {{\.?LCPI[0-9]+_[0-9]+}}
 ; CHECK-NEXT:    fnstcw {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    orl $3072, %ecx # imm = 0xC00
-; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    sarl $31, %ecx
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    orl $3072, %eax # imm = 0xC00
+; CHECK-NEXT:    movw %ax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %eax, (%ebx)
-; CHECK-NEXT:    movl %ecx, 4(%ebx)
+; CHECK-NEXT:    imull $3600000, 20(%ebx), %eax # imm = 0x36EE80
+; CHECK-NEXT:    imull $60000, 24(%ebx), %ecx # imm = 0xEA60
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    sarl $31, %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, (%ebx)
+; CHECK-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, 4(%ebx)
+; CHECK-NEXT:    movb $1, 36(%ebx)
 ; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/pr34088.ll b/llvm/test/CodeGen/X86/pr34088.ll
index 4462a4a5ec070..c90736ddb3a19 100644
--- a/llvm/test/CodeGen/X86/pr34088.ll
+++ b/llvm/test/CodeGen/X86/pr34088.ll
@@ -22,9 +22,9 @@ define i32 @pr34088() local_unnamed_addr {
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movaps %xmm0, (%esp)
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD
 ; CHECK-NEXT:    movaps %xmm1, (%esp)
 ; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/pr34421.ll b/llvm/test/CodeGen/X86/pr34421.ll
index 5ae71bd905504..dcb8421cfa927 100644
--- a/llvm/test/CodeGen/X86/pr34421.ll
+++ b/llvm/test/CodeGen/X86/pr34421.ll
@@ -11,8 +11,8 @@ define void @thread_selfcounts() noimplicitfloat noredzone nounwind {
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    ud2
 ;
diff --git a/llvm/test/CodeGen/X86/pr3457.ll b/llvm/test/CodeGen/X86/pr3457.ll
index 4f11bc949b772..dd0b27fe5c1a4 100644
--- a/llvm/test/CodeGen/X86/pr3457.ll
+++ b/llvm/test/CodeGen/X86/pr3457.ll
@@ -6,24 +6,19 @@
 define void @foo(ptr nocapture %P) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    subl $24, %esp
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    calll _test
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd %xmm0, (%esp) ## 8-byte Spill
 ; CHECK-NEXT:    calll _test
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    movsd (%esp), %xmm1 ## 8-byte Reload
-; CHECK-NEXT:    ## xmm1 = mem[0],zero
-; CHECK-NEXT:    mulsd %xmm1, %xmm1
 ; CHECK-NEXT:    mulsd %xmm0, %xmm0
-; CHECK-NEXT:    addsd %xmm1, %xmm0
-; CHECK-NEXT:    movsd %xmm0, (%esi)
-; CHECK-NEXT:    addl $24, %esp
-; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    mulsd %xmm1, %xmm1
+; CHECK-NEXT:    addsd %xmm0, %xmm1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movsd %xmm1, (%eax)
+; CHECK-NEXT:    addl $28, %esp
 ; CHECK-NEXT:    retl
 entry:
 	%0 = tail call double (...) @test() nounwind		; <double> [#uses=2]
diff --git a/llvm/test/CodeGen/X86/pr34605.ll b/llvm/test/CodeGen/X86/pr34605.ll
index 25dd6a7436a8a..d87be794076f0 100644
--- a/llvm/test/CodeGen/X86/pr34605.ll
+++ b/llvm/test/CodeGen/X86/pr34605.ll
@@ -4,7 +4,15 @@
 define void @pr34605(ptr nocapture %s, i32 %p) {
 ; CHECK-LABEL: pr34605:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    vmovups %zmm0, 448(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 384(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 320(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 256(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 192(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 128(%eax)
+; CHECK-NEXT:    vmovups %zmm0, 64(%eax)
 ; CHECK-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm0
 ; CHECK-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %k0
 ; CHECK-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %k1
@@ -19,14 +27,6 @@ define void @pr34605(ptr nocapture %s, i32 %p) {
 ; CHECK-NEXT:    kandq %k1, %k0, %k1
 ; CHECK-NEXT:    vmovdqu8 {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    vmovdqu64 %zmm0, (%eax)
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovdqu64 %zmm0, 64(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 128(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 192(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 256(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 320(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 384(%eax)
-; CHECK-NEXT:    vmovdqu64 %zmm0, 448(%eax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/pr34855.ll b/llvm/test/CodeGen/X86/pr34855.ll
index 1a22b85ef761b..a1d7108cf9c49 100644
--- a/llvm/test/CodeGen/X86/pr34855.ll
+++ b/llvm/test/CodeGen/X86/pr34855.ll
@@ -6,8 +6,8 @@ define void @PR34855(ptr%p0, ptr%p1, ptr%p2) {
 ; X86-LABEL: PR34855:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/pr35918.ll b/llvm/test/CodeGen/X86/pr35918.ll
index f57fab3084a9e..214b66f239151 100644
--- a/llvm/test/CodeGen/X86/pr35918.ll
+++ b/llvm/test/CodeGen/X86/pr35918.ll
@@ -8,15 +8,15 @@ define void @fetch_r16g16_snorm_unorm8(ptr, ptr, i32, i32, ptr) nounwind {
 ; X86-LABEL: fetch_r16g16_snorm_unorm8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpsrlw $7, %xmm0, %xmm0
 ; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT:    vmovd %xmm0, %ecx
-; X86-NEXT:    orl $-16777216, %ecx # imm = 0xFF000000
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    vmovd %xmm0, %eax
+; X86-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fetch_r16g16_snorm_unorm8:
diff --git a/llvm/test/CodeGen/X86/pr35972.ll b/llvm/test/CodeGen/X86/pr35972.ll
index 981c47800c0f3..c653d31683d63 100644
--- a/llvm/test/CodeGen/X86/pr35972.ll
+++ b/llvm/test/CodeGen/X86/pr35972.ll
@@ -4,12 +4,12 @@
 define void @test3(i32 %c, ptr %ptr) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    sbbl %ecx, %ecx
-; CHECK-NEXT:    kmovd %ecx, %k0
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    kmovd %eax, %k0
 ; CHECK-NEXT:    kunpckdq %k0, %k0, %k0
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    kmovq %k0, (%eax)
 ; CHECK-NEXT:    retl
   %cmp = icmp eq i32 %c, 0
diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll
index 0ad35309b87bb..6c7c6bf3bdbdd 100644
--- a/llvm/test/CodeGen/X86/pr35982.ll
+++ b/llvm/test/CodeGen/X86/pr35982.ll
@@ -7,14 +7,14 @@ define float @PR35982_emms(<1 x i64>) nounwind {
 ; NO-POSTRA:       # %bb.0:
 ; NO-POSTRA-NEXT:    subl $8, %esp
 ; NO-POSTRA-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; NO-POSTRA-NEXT:    movl %eax, (%esp)
 ; NO-POSTRA-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; NO-POSTRA-NEXT:    punpckhdq %mm0, %mm0 # mm0 = mm0[1,1]
-; NO-POSTRA-NEXT:    movd %mm0, %ecx
-; NO-POSTRA-NEXT:    emms
-; NO-POSTRA-NEXT:    movl %eax, (%esp)
+; NO-POSTRA-NEXT:    movd %mm0, %eax
+; NO-POSTRA-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; NO-POSTRA-NEXT:    fildl (%esp)
-; NO-POSTRA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; NO-POSTRA-NEXT:    fiaddl {{[0-9]+}}(%esp)
+; NO-POSTRA-NEXT:    emms
 ; NO-POSTRA-NEXT:    addl $8, %esp
 ; NO-POSTRA-NEXT:    retl
 ;
@@ -24,12 +24,12 @@ define float @PR35982_emms(<1 x i64>) nounwind {
 ; POSTRA-NEXT:    movq {{[0-9]+}}(%esp), %mm0
 ; POSTRA-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; POSTRA-NEXT:    punpckhdq %mm0, %mm0 # mm0 = mm0[1,1]
-; POSTRA-NEXT:    movd %mm0, %ecx
-; POSTRA-NEXT:    emms
 ; POSTRA-NEXT:    movl %eax, (%esp)
+; POSTRA-NEXT:    movd %mm0, %eax
+; POSTRA-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; POSTRA-NEXT:    fildl (%esp)
-; POSTRA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; POSTRA-NEXT:    fiaddl {{[0-9]+}}(%esp)
+; POSTRA-NEXT:    emms
 ; POSTRA-NEXT:    addl $8, %esp
 ; POSTRA-NEXT:    retl
   %2 = bitcast <1 x i64> %0 to <2 x i32>
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index eecd15dd2afe9..2c63c79c03766 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -23,103 +23,100 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $176, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    movzbl (%eax), %ecx
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    divb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $30, %ecx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    shll $30, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    sarl $30, %ecx
 ; X86-NEXT:    shrdl $1, %eax, %ecx
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    shldl $30, %ebx, %ecx
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shrdl $2, %ebx, %edx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shldl $30, %edi, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    shrdl $2, %edi, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT:    bsrl %edx, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    bsrl %ecx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
 ; X86-NEXT:    jmp .LBB0_3
 ; X86-NEXT:  .LBB0_1:
-; X86-NEXT:    bsrl %ecx, %eax
-; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    bsrl %eax, %ebx
+; X86-NEXT:    xorl $31, %ebx
 ; X86-NEXT:  .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll $30, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $30, %edx
 ; X86-NEXT:    jne .LBB0_4
 ; X86-NEXT:  # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT:    movl $64, %esi
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl $64, %edx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    je .LBB0_7
 ; X86-NEXT:    jmp .LBB0_8
 ; X86-NEXT:  .LBB0_4:
-; X86-NEXT:    bsrl %esi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    bsrl %edx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    jne .LBB0_8
 ; X86-NEXT:  .LBB0_7: # %BB_udiv-special-cases
-; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:  .LBB0_8: # %BB_udiv-special-cases
-; X86-NEXT:    addl $-66, %eax
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    addl $-66, %edx
 ; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    adcl $3, %esi
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movb $1, %cl
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    adcl $3, %eax
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl (%eax), %eax
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    divb %cl
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    jne .LBB0_10
 ; X86-NEXT:  # %bb.9: # %select.false.sink
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl $65, %edx
-; X86-NEXT:    cmpl %eax, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl $65, %eax
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
 ; X86-NEXT:  .LBB0_10: # %select.end
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    jne .LBB0_15
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    jne .LBB0_16
 ; X86-NEXT:  # %bb.11: # %select.end
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl $65, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    je .LBB0_15
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    xorl $65, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    je .LBB0_16
 ; X86-NEXT:  # %bb.12: # %udiv-bb1
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl $1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb $65, %cl
-; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -128,23 +125,31 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 128(%esp,%edx), %eax
-; X86-NEXT:    movl 132(%esp,%edx), %edi
-; X86-NEXT:    movl 136(%esp,%edx), %edx
-; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 132(%esp,%eax), %edx
+; X86-NEXT:    movl 136(%esp,%eax), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %eax, %edi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    je .LBB0_15
 ; X86-NEXT:  # %bb.13: # %udiv-preheader
 ; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -154,97 +159,108 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-NEXT:    movl 84(%esp,%eax), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shrdl %cl, %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    andb $12, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%ecx), %eax
+; X86-NEXT:    movl 84(%esp,%ecx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $3, %esi
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $3, %eax
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_14: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ecx
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    andl $2, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    leal (%esi,%ebx,2), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $2, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal (%edx,%edi,2), %edi
+; X86-NEXT:    cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    shll $30, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    sarl $30, %edx
+; X86-NEXT:    shrdl $1, %ebx, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    shll $30, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    sarl $30, %esi
-; X86-NEXT:    shrdl $1, %edi, %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    leal (%esi,%esi), %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    subl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    shrdl $31, %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %edi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $3, %ebx
-; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $3, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_14
-; X86-NEXT:  .LBB0_15: # %udiv-end
+; X86-NEXT:  .LBB0_15: # %udiv-loop-exit
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:  .LBB0_16: # %udiv-end
 ; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movb $0, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll
index 205849e7d05db..9fd473f7dd8a6 100644
--- a/llvm/test/CodeGen/X86/pr38738.ll
+++ b/llvm/test/CodeGen/X86/pr38738.ll
@@ -129,8 +129,8 @@ define void @tryset(ptr nocapture %x) {
 ;
 ; X86SSE2-LABEL: tryset:
 ; X86SSE2:       # %bb.0:
-; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86SSE2-NEXT:    movsd %xmm0, 56(%eax)
 ; X86SSE2-NEXT:    movsd %xmm0, 48(%eax)
 ; X86SSE2-NEXT:    movsd %xmm0, 40(%eax)
@@ -151,8 +151,8 @@ define void @tryset(ptr nocapture %x) {
 ;
 ; X86AVX-LABEL: tryset:
 ; X86AVX:       # %bb.0:
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vmovups %ymm0, 32(%eax)
 ; X86AVX-NEXT:    vmovups %ymm0, (%eax)
 ; X86AVX-NEXT:    vzeroupper
@@ -177,23 +177,23 @@ define void @trycpy(ptr nocapture %x, ptr nocapture readonly %y) {
 ; X86SSE-LABEL: trycpy:
 ; X86SSE:       # %bb.0:
 ; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE-NEXT:    movl 28(%eax), %edx
 ; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86SSE-NEXT:    movl 28(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 28(%eax)
-; X86SSE-NEXT:    movl 24(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 24(%eax)
-; X86SSE-NEXT:    movl 20(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 20(%eax)
-; X86SSE-NEXT:    movl 16(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 16(%eax)
-; X86SSE-NEXT:    movl 12(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 12(%eax)
-; X86SSE-NEXT:    movl 8(%ecx), %edx
-; X86SSE-NEXT:    movl %edx, 8(%eax)
-; X86SSE-NEXT:    movl (%ecx), %edx
-; X86SSE-NEXT:    movl 4(%ecx), %ecx
-; X86SSE-NEXT:    movl %ecx, 4(%eax)
-; X86SSE-NEXT:    movl %edx, (%eax)
+; X86SSE-NEXT:    movl %edx, 28(%ecx)
+; X86SSE-NEXT:    movl 24(%eax), %edx
+; X86SSE-NEXT:    movl %edx, 24(%ecx)
+; X86SSE-NEXT:    movl 20(%eax), %edx
+; X86SSE-NEXT:    movl %edx, 20(%ecx)
+; X86SSE-NEXT:    movl 16(%eax), %edx
+; X86SSE-NEXT:    movl %edx, 16(%ecx)
+; X86SSE-NEXT:    movl 12(%eax), %edx
+; X86SSE-NEXT:    movl %edx, 12(%ecx)
+; X86SSE-NEXT:    movl 8(%eax), %edx
+; X86SSE-NEXT:    movl %edx, 8(%ecx)
+; X86SSE-NEXT:    movl (%eax), %edx
+; X86SSE-NEXT:    movl 4(%eax), %eax
+; X86SSE-NEXT:    movl %eax, 4(%ecx)
+; X86SSE-NEXT:    movl %edx, (%ecx)
 ; X86SSE-NEXT:    retl
 ;
 ; X64SSE2-LABEL: trycpy:
@@ -211,15 +211,15 @@ define void @trycpy(ptr nocapture %x, ptr nocapture readonly %y) {
 ; X86SSE2-LABEL: trycpy:
 ; X86SSE2:       # %bb.0:
 ; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86SSE2-NEXT:    movsd %xmm0, 24(%eax)
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86SSE2-NEXT:    movsd %xmm0, 24(%ecx)
 ; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86SSE2-NEXT:    movsd %xmm0, 16(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, 16(%ecx)
 ; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86SSE2-NEXT:    movsd %xmm1, 8(%eax)
-; X86SSE2-NEXT:    movsd %xmm0, (%eax)
+; X86SSE2-NEXT:    movsd %xmm1, 8(%ecx)
+; X86SSE2-NEXT:    movsd %xmm0, (%ecx)
 ; X86SSE2-NEXT:    retl
 ;
 ; X64AVX-LABEL: trycpy:
@@ -232,8 +232,8 @@ define void @trycpy(ptr nocapture %x, ptr nocapture readonly %y) {
 ; X86AVX-LABEL: trycpy:
 ; X86AVX:       # %bb.0:
 ; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86AVX-NEXT:    vmovups (%ecx), %ymm0
+; X86AVX-NEXT:    vmovups (%eax), %ymm0
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86AVX-NEXT:    vmovups %ymm0, (%eax)
 ; X86AVX-NEXT:    vzeroupper
 ; X86AVX-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index 6a0c13526ac18..2c0819c389584 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -52,11 +52,11 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_3: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movl $0, h
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    cltd
 ; CHECK-NEXT:    idivl a
 ; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    movl $0, h
 ; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
 ; CHECK-NEXT:    cmpb $8, %dh
 ; CHECK-NEXT:    jg .LBB0_7
diff --git a/llvm/test/CodeGen/X86/pr38819.ll b/llvm/test/CodeGen/X86/pr38819.ll
index e227586400c90..cf91bb6b38878 100644
--- a/llvm/test/CodeGen/X86/pr38819.ll
+++ b/llvm/test/CodeGen/X86/pr38819.ll
@@ -7,10 +7,12 @@ define void @foo(i64 %x, ptr %b) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
 ; CHECK-NEXT:    calll __floatdisf
 ; CHECK-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll
index a920efbec59ea..3cac9cc5c64ac 100644
--- a/llvm/test/CodeGen/X86/pr40539.ll
+++ b/llvm/test/CodeGen/X86/pr40539.ll
@@ -40,19 +40,19 @@ define zeroext i1 @_Z8test_cosv() {
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movss {{.*#+}} xmm1 = [8.60000014E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; CHECK-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = [8.60000014E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    fcos
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    fstps (%esp)
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    cmpless %xmm0, %xmm1
-; CHECK-NEXT:    cmpless {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; CHECK-NEXT:    andps %xmm1, %xmm0
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    cmpless %xmm1, %xmm0
+; CHECK-NEXT:    cmpless {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; CHECK-NEXT:    andps %xmm0, %xmm1
+; CHECK-NEXT:    movd %xmm1, %eax
 ; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/pr44396.ll b/llvm/test/CodeGen/X86/pr44396.ll
index e33c65fb35820..c40d06de97ad7 100644
--- a/llvm/test/CodeGen/X86/pr44396.ll
+++ b/llvm/test/CodeGen/X86/pr44396.ll
@@ -23,10 +23,9 @@ define double @c() nounwind {
 ; CHECK-NEXT:    fildll (%esp)
 ; CHECK-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%edx,4)
 ; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    fxch %st(1)
 ; CHECK-NEXT:    fcmovne %st(1), %st
 ; CHECK-NEXT:    fstp %st(1)
 ; CHECK-NEXT:    addl $16, %esp
diff --git a/llvm/test/CodeGen/X86/pr44915.ll b/llvm/test/CodeGen/X86/pr44915.ll
index 99205ab60ae11..298452ae497cf 100644
--- a/llvm/test/CodeGen/X86/pr44915.ll
+++ b/llvm/test/CodeGen/X86/pr44915.ll
@@ -5,35 +5,31 @@
 define i32 @extract3(ptr, i32) nounwind {
 ; X86-LABEL: extract3:
 ; X86:       # %bb.0: # %_L1
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl 24(%esp), %esi
-; X86-NEXT:    andl $7, %esi
-; X86-NEXT:    movl 20(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    shrb $3, %cl
+; X86-NEXT:    movl 12(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $12, %ecx
+; X86-NEXT:    movb %cl, 4(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $9, %ecx
 ; X86-NEXT:    andb $7, %cl
-; X86-NEXT:    movb %bl, %ch
-; X86-NEXT:    andb $7, %ch
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrl $6, %eax
+; X86-NEXT:    movb %cl, 3(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $6, %ecx
+; X86-NEXT:    andb $7, %cl
+; X86-NEXT:    movb %cl, 2(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $7, %cl
+; X86-NEXT:    movb %cl, (%esp)
+; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $7, %al
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shrl $9, %edx
-; X86-NEXT:    andb $7, %dl
-; X86-NEXT:    shrl $12, %ebx
-; X86-NEXT:    movb %bl, 4(%esp)
-; X86-NEXT:    movb %dl, 3(%esp)
-; X86-NEXT:    movb %al, 2(%esp)
-; X86-NEXT:    movb %ch, (%esp)
-; X86-NEXT:    movb %cl, 1(%esp)
-; X86-NEXT:    movzbl (%esp,%esi), %eax
+; X86-NEXT:    movb %al, 1(%esp)
+; X86-NEXT:    movl 16(%esp), %eax
+; X86-NEXT:    andl $7, %eax
+; X86-NEXT:    movzbl (%esp,%eax), %eax
 ; X86-NEXT:    andl $7, %eax
 ; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extract3:
diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll
index a624055dc1041..89e78be5a88a4 100644
--- a/llvm/test/CodeGen/X86/pr46527.ll
+++ b/llvm/test/CodeGen/X86/pr46527.ll
@@ -11,19 +11,19 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) {
 ; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    notb %dl
-; CHECK-NEXT:    andb $1, %dl
-; CHECK-NEXT:    movzbl %dl, %edx
-; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    notb %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movzbl %cl, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm1
 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; CHECK-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; CHECK-NEXT:    paddb %xmm1, %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm1
 ; CHECK-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1
-; CHECK-NEXT:    movdqa %xmm1, (%ecx)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movdqa %xmm1, (%eax)
 ; CHECK-NEXT:    retl
 entry:
   %0 = select i1 %flag, i8 0, i8 2
diff --git a/llvm/test/CodeGen/X86/pr49028.ll b/llvm/test/CodeGen/X86/pr49028.ll
index 8df070356bb89..1528673803b4e 100644
--- a/llvm/test/CodeGen/X86/pr49028.ll
+++ b/llvm/test/CodeGen/X86/pr49028.ll
@@ -5,9 +5,9 @@
 define zeroext i16 @PR49028(i16 zeroext %0, ptr %1) {
 ; X86-LABEL: PR49028:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sete (%ecx)
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 1a7551f6117e8..62ab31d55d217 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -10,9 +10,9 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    # implicit-def: $si
 ; X86-NEXT:    .p2align 4
@@ -22,12 +22,12 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.3: # %if.end1401
 ; X86-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movw %si, s_2
-; X86-NEXT:    movw %dx, s_0
-; X86-NEXT:    incl %ecx
+; X86-NEXT:    movw %ax, s_0
+; X86-NEXT:    incl %eax
 ; X86-NEXT:    incl %edx
-; X86-NEXT:    cmpw $73, %cx
+; X86-NEXT:    cmpw $73, %dx
 ; X86-NEXT:    jl .LBB0_1
 ; X86-NEXT:  # %bb.4: # %for.body1703
 ; X86-NEXT:  .LBB0_2: # %if.then671
diff --git a/llvm/test/CodeGen/X86/pr50782.ll b/llvm/test/CodeGen/X86/pr50782.ll
index 591a33446d4e3..734a06947877e 100644
--- a/llvm/test/CodeGen/X86/pr50782.ll
+++ b/llvm/test/CodeGen/X86/pr50782.ll
@@ -23,7 +23,6 @@ define void @h(float %i) {
 ; CHECK-NEXT:    subl $32, %esp
 ; CHECK-NEXT:    movl %esp, %esi
 ; CHECK-NEXT:    .cfi_offset %esi, -12
-; CHECK-NEXT:    flds 8(%ebp)
 ; CHECK-NEXT:    movl _a, %ecx
 ; CHECK-NEXT:    leal 3(%ecx), %eax
 ; CHECK-NEXT:    andl $-4, %eax
@@ -31,6 +30,7 @@ define void @h(float %i) {
 ; CHECK-NEXT:    movl %esp, %eax
 ; CHECK-NEXT:    andl $-16, %eax
 ; CHECK-NEXT:    movl %eax, %esp
+; CHECK-NEXT:    flds 8(%ebp)
 ; CHECK-NEXT:    fsts 8(%esi) # 4-byte Folded Spill
 ; CHECK-NEXT:    fadds _b
 ; CHECK-NEXT:    fsts _d
@@ -49,8 +49,8 @@ define void @h(float %i) {
 ; CHECK-NEXT:  LBB0_2: # %for.cond1.preheader
 ; CHECK-NEXT:    movl _e, %ecx
 ; CHECK-NEXT:    movl %ecx, 12(%esi)
-; CHECK-NEXT:    fildl 12(%esi)
 ; CHECK-NEXT:    movl _c, %edx
+; CHECK-NEXT:    fildl 12(%esi)
 ; CHECK-NEXT:    jmp LBB0_3
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_5: # %for.inc
diff --git a/llvm/test/CodeGen/X86/pr51878_computeAliasing.ll b/llvm/test/CodeGen/X86/pr51878_computeAliasing.ll
index cf50d0fd5ceef..899e8edb8f3f8 100644
--- a/llvm/test/CodeGen/X86/pr51878_computeAliasing.ll
+++ b/llvm/test/CodeGen/X86/pr51878_computeAliasing.ll
@@ -16,8 +16,8 @@
 define i16 @main() {
 ; CHECK-LABEL: main:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movw $1, foo
 ; CHECK-NEXT:    movw $2, bar
+; CHECK-NEXT:    movw $1, foo
 ; CHECK-NEXT:    movw $4, aliasFoo
 ; CHECK-NEXT:    movzwl foo, %eax
 ; CHECK-NEXT:    addw bar, %ax
diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll
index 750751d404f38..e598b866a1c49 100644
--- a/llvm/test/CodeGen/X86/pr53419.ll
+++ b/llvm/test/CodeGen/X86/pr53419.ll
@@ -23,9 +23,9 @@ define i1 @intrinsic_v2i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: intrinsic_v2i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -47,9 +47,9 @@ define i1 @intrinsic_v4i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: intrinsic_v4i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -71,12 +71,12 @@ define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: intrinsic_v8i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -98,9 +98,9 @@ define i1 @vector_version_v2i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: vector_version_v2i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -123,9 +123,9 @@ define i1 @vector_version_v4i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: vector_version_v4i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -148,12 +148,12 @@ define i1 @vector_version_v8i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: vector_version_v8i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -176,9 +176,9 @@ define i1 @mixed_version_v2i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: mixed_version_v2i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -201,9 +201,9 @@ define i1 @mixed_version_v4i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: mixed_version_v4i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -226,12 +226,12 @@ define i1 @mixed_version_v8i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: mixed_version_v8i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -254,9 +254,9 @@ define i1 @scalar_version_i16(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: scalar_version_i16:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    cmpw (%ecx), %ax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -277,9 +277,9 @@ define i1 @scalar_version_i32(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: scalar_version_i32:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    cmpl (%ecx), %eax
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -300,12 +300,12 @@ define i1 @scalar_version_i64(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X86-LABEL: scalar_version_i64:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl 4(%edx), %eax
+; X86-NEXT:    xorl (%edx), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
diff --git a/llvm/test/CodeGen/X86/pr57283.ll b/llvm/test/CodeGen/X86/pr57283.ll
index 25e5ac72ab70c..e67bbbac5e854 100644
--- a/llvm/test/CodeGen/X86/pr57283.ll
+++ b/llvm/test/CodeGen/X86/pr57283.ll
@@ -10,9 +10,9 @@ define void @PR57283() nounwind {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index d8738081842a3..2aa46f9f341b9 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -32,15 +32,13 @@ define double @foo(double %0) #0 {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $60, %esp
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; X86-NEXT:    wait
 ; X86-NEXT:    movl $1024, (%esp) # imm = 0x400
 ; X86-NEXT:    calll fesetround@PLT
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fstl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
 ; X86-NEXT:    fld1
 ; X86-NEXT:    fstl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; X86-NEXT:    fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
-; X86-NEXT:    fdivrp %st, %st(1)
+; X86-NEXT:    fdivp %st, %st(1)
 ; X86-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
 ; X86-NEXT:    wait
 ; X86-NEXT:    movl $1024, (%esp) # imm = 0x400
@@ -101,11 +99,10 @@ define double @bar(double %0) #0 {
 ; X86-LABEL: bar:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $28, %esp
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    wait
 ; X86-NEXT:    #APP
 ; X86-NEXT:    fldcw 0
 ; X86-NEXT:    #NO_APP
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-NEXT:    fld1
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fdiv %st(2), %st
diff --git a/llvm/test/CodeGen/X86/pr62145.ll b/llvm/test/CodeGen/X86/pr62145.ll
index 38208422be6b4..9a27a1678c841 100644
--- a/llvm/test/CodeGen/X86/pr62145.ll
+++ b/llvm/test/CodeGen/X86/pr62145.ll
@@ -7,16 +7,16 @@ define void @f(i64 %a, i64 %b) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $-65536, %edi # imm = 0xFFFF0000
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $-65536, %edi # imm = 0xFFFF0000
 ; X86-NEXT:    cmpl $65527, %eax # imm = 0xFFF7
 ; X86-NEXT:    jne .LBB0_2
 ; X86-NEXT:  # %bb.1: # %if.then
 ; X86-NEXT:    calll ext1@PLT
 ; X86-NEXT:  .LBB0_2: # %if.end
-; X86-NEXT:    calll ext2@PLT
 ; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    calll ext2@PLT
 ; X86-NEXT:    cmpl $-589824, %esi # imm = 0xFFF70000
 ; X86-NEXT:    jne .LBB0_3
 ; X86-NEXT:  # %bb.4: # %if.then2
diff --git a/llvm/test/CodeGen/X86/pr69965.ll b/llvm/test/CodeGen/X86/pr69965.ll
index 33bea976c7896..fe8d889366849 100644
--- a/llvm/test/CodeGen/X86/pr69965.ll
+++ b/llvm/test/CodeGen/X86/pr69965.ll
@@ -5,18 +5,18 @@
 define i64 @PR69965(ptr %input_ptrs, ptr %output_ptrs) {
 ; X86-LABEL: PR69965:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    notl %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    orl $32768, %eax # imm = 0x8000
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ecx
 ; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll
index db77baa7ff8a3..643c5f20ee07e 100644
--- a/llvm/test/CodeGen/X86/pr78897.ll
+++ b/llvm/test/CodeGen/X86/pr78897.ll
@@ -24,14 +24,14 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,0,0,0,0,0,0,0,0]
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    movd %xmm2, %esi
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movl $286331152, %edi # imm = 0x11111110
 ; X86-SSE2-NEXT:    movl %ecx, %eax
 ; X86-SSE2-NEXT:    mull %edi
 ; X86-SSE2-NEXT:    imull $286331153, %ecx, %ebx # imm = 0x11111111
 ; X86-SSE2-NEXT:    addl %edx, %ebx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT:    movd %xmm1, %esi
 ; X86-SSE2-NEXT:    imull $286331152, %esi, %edx # imm = 0x11111110
 ; X86-SSE2-NEXT:    addl %ebx, %edx
 ; X86-SSE2-NEXT:    movd %edx, %xmm2
@@ -40,15 +40,15 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE2-NEXT:    movl %ecx, %eax
 ; X86-SSE2-NEXT:    mull %edi
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT:    xorl $17895697, %esi # imm = 0x1111111
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
 ; X86-SSE2-NEXT:    imull $286331153, %ecx, %ecx # imm = 0x11111111
 ; X86-SSE2-NEXT:    addl %edx, %ecx
+; X86-SSE2-NEXT:    xorl $17895697, %esi # imm = 0x1111111
 ; X86-SSE2-NEXT:    imull $286331152, %esi, %edx # imm = 0x11111110
 ; X86-SSE2-NEXT:    addl %ecx, %edx
 ; X86-SSE2-NEXT:    movd %edx, %xmm2
 ; X86-SSE2-NEXT:    movd %eax, %xmm3
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-SSE2-NEXT:    pand %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
@@ -104,8 +104,8 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE42-NEXT:    movl $286331152, %edi # imm = 0x11111110
 ; X86-SSE42-NEXT:    movl %ecx, %eax
 ; X86-SSE42-NEXT:    mull %edi
-; X86-SSE42-NEXT:    pextrd $1, %xmm1, %esi
 ; X86-SSE42-NEXT:    imull $286331153, %ecx, %ebx # imm = 0x11111111
+; X86-SSE42-NEXT:    pextrd $1, %xmm1, %esi
 ; X86-SSE42-NEXT:    addl %edx, %ebx
 ; X86-SSE42-NEXT:    imull $286331152, %esi, %edx # imm = 0x11111110
 ; X86-SSE42-NEXT:    addl %ebx, %edx
@@ -114,9 +114,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE42-NEXT:    xorl $286331153, %ecx # imm = 0x11111111
 ; X86-SSE42-NEXT:    movl %ecx, %eax
 ; X86-SSE42-NEXT:    mull %edi
-; X86-SSE42-NEXT:    xorl $17895697, %esi # imm = 0x1111111
 ; X86-SSE42-NEXT:    imull $286331153, %ecx, %ecx # imm = 0x11111111
 ; X86-SSE42-NEXT:    addl %edx, %ecx
+; X86-SSE42-NEXT:    xorl $17895697, %esi # imm = 0x1111111
 ; X86-SSE42-NEXT:    imull $286331152, %esi, %edx # imm = 0x11111110
 ; X86-SSE42-NEXT:    addl %ecx, %edx
 ; X86-SSE42-NEXT:    movd %eax, %xmm1
@@ -170,22 +170,22 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
 ; X86-AVX2-NEXT:    vmovd %xmm1, %edx
 ; X86-AVX2-NEXT:    movl $286331152, %ecx # imm = 0x11111110
-; X86-AVX2-NEXT:    mulxl %ecx, %edi, %esi
+; X86-AVX2-NEXT:    mulxl %ecx, %esi, %eax
+; X86-AVX2-NEXT:    imull $286331153, %edx, %edi # imm = 0x11111111
+; X86-AVX2-NEXT:    addl %eax, %edi
 ; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
-; X86-AVX2-NEXT:    imull $286331153, %edx, %ebx # imm = 0x11111111
-; X86-AVX2-NEXT:    addl %esi, %ebx
-; X86-AVX2-NEXT:    imull $286331152, %eax, %esi # imm = 0x11111110
-; X86-AVX2-NEXT:    addl %ebx, %esi
-; X86-AVX2-NEXT:    vmovd %edi, %xmm1
+; X86-AVX2-NEXT:    imull $286331152, %eax, %ebx # imm = 0x11111110
+; X86-AVX2-NEXT:    addl %edi, %ebx
+; X86-AVX2-NEXT:    vmovd %esi, %xmm1
 ; X86-AVX2-NEXT:    xorl $286331153, %edx # imm = 0x11111111
-; X86-AVX2-NEXT:    mulxl %ecx, %edi, %ecx
-; X86-AVX2-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
-; X86-AVX2-NEXT:    xorl $17895697, %eax # imm = 0x1111111
+; X86-AVX2-NEXT:    mulxl %ecx, %esi, %ecx
+; X86-AVX2-NEXT:    vpinsrd $1, %ebx, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    imull $286331153, %edx, %edx # imm = 0x11111111
 ; X86-AVX2-NEXT:    addl %ecx, %edx
+; X86-AVX2-NEXT:    xorl $17895697, %eax # imm = 0x1111111
 ; X86-AVX2-NEXT:    imull $286331152, %eax, %eax # imm = 0x11111110
 ; X86-AVX2-NEXT:    addl %edx, %eax
-; X86-AVX2-NEXT:    vmovd %edi, %xmm2
+; X86-AVX2-NEXT:    vmovd %esi, %xmm2
 ; X86-AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
 ; X86-AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm1
@@ -228,22 +228,22 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-AVX512-NEXT:    kmovd %eax, %k1
 ; X86-AVX512-NEXT:    knotw %k1, %k2
 ; X86-AVX512-NEXT:    vmovdqu8 {{.*#+}} xmm0 {%k2} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
-; X86-AVX512-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX512-NEXT:    vmovd %xmm0, %edx
-; X86-AVX512-NEXT:    movl $286331152, %ecx # imm = 0x11111110
-; X86-AVX512-NEXT:    mulxl %ecx, %edi, %esi
-; X86-AVX512-NEXT:    imull $286331153, %edx, %ebx # imm = 0x11111111
-; X86-AVX512-NEXT:    addl %esi, %ebx
-; X86-AVX512-NEXT:    imull $286331152, %eax, %esi # imm = 0x11111110
-; X86-AVX512-NEXT:    addl %ebx, %esi
-; X86-AVX512-NEXT:    vmovd %edi, %xmm0
-; X86-AVX512-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
-; X86-AVX512-NEXT:    xorl $17895697, %eax # imm = 0x1111111
+; X86-AVX512-NEXT:    movl $286331152, %eax # imm = 0x11111110
+; X86-AVX512-NEXT:    mulxl %eax, %esi, %ecx
+; X86-AVX512-NEXT:    imull $286331153, %edx, %edi # imm = 0x11111111
+; X86-AVX512-NEXT:    addl %ecx, %edi
+; X86-AVX512-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-AVX512-NEXT:    imull $286331152, %ecx, %ebx # imm = 0x11111110
+; X86-AVX512-NEXT:    addl %edi, %ebx
+; X86-AVX512-NEXT:    vmovd %esi, %xmm0
+; X86-AVX512-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    xorl $286331153, %edx # imm = 0x11111111
-; X86-AVX512-NEXT:    mulxl %ecx, %esi, %ecx
+; X86-AVX512-NEXT:    mulxl %eax, %esi, %eax
 ; X86-AVX512-NEXT:    imull $286331153, %edx, %edx # imm = 0x11111111
-; X86-AVX512-NEXT:    addl %ecx, %edx
-; X86-AVX512-NEXT:    imull $286331152, %eax, %eax # imm = 0x11111110
+; X86-AVX512-NEXT:    addl %eax, %edx
+; X86-AVX512-NEXT:    xorl $17895697, %ecx # imm = 0x1111111
+; X86-AVX512-NEXT:    imull $286331152, %ecx, %eax # imm = 0x11111110
 ; X86-AVX512-NEXT:    addl %edx, %eax
 ; X86-AVX512-NEXT:    vmovd %esi, %xmm1
 ; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
index ed85e7fe9fb60..35af161fa1ef0 100644
--- a/llvm/test/CodeGen/X86/pr89877.ll
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -9,9 +9,9 @@ define i32 @sext_known_nonzero(i16 %xx) {
 ; X86-NEXT:    movl $256, %eax # imm = 0x100
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    cwtl
-; X86-NEXT:    bsfl %eax, %ecx
-; X86-NEXT:    movl $32, %eax
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $32, %ecx
+; X86-NEXT:    bsfl %eax, %eax
+; X86-NEXT:    cmovel %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sext_known_nonzero:
diff --git a/llvm/test/CodeGen/X86/promote-i16.ll b/llvm/test/CodeGen/X86/promote-i16.ll
index 450285d1b80eb..ad539c8ae3222 100644
--- a/llvm/test/CodeGen/X86/promote-i16.ll
+++ b/llvm/test/CodeGen/X86/promote-i16.ll
@@ -64,10 +64,10 @@ define void @bat(ptr %a, ptr %x, i16 signext %y) nounwind {
 ; X86-LABEL: bat:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    subw {{[0-9]+}}(%esp), %cx
-; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movw %ax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: bat:
diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll
index df1e9d61e3ef4..8c4d271082774 100644
--- a/llvm/test/CodeGen/X86/promote-vec3.ll
+++ b/llvm/test/CodeGen/X86/promote-vec3.ll
@@ -8,8 +8,8 @@
 define <3 x i16> @zext_i8(<3 x i8>) {
 ; SSE3-LABEL: zext_i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE3-NEXT:    # kill: def $dx killed $dx killed $edx
@@ -21,9 +21,9 @@ define <3 x i16> @zext_i8(<3 x i8>) {
 ; SSE41-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    pinsrb $1, %edx, %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    movd %xmm0, %eax
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE41-NEXT:    # kill: def $dx killed $dx killed $edx
 ; SSE41-NEXT:    # kill: def $cx killed $cx killed $ecx
@@ -34,9 +34,9 @@ define <3 x i16> @zext_i8(<3 x i8>) {
 ; AVX-32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX-32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-32-NEXT:    vmovd %xmm0, %eax
+; AVX-32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; AVX-32-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-32-NEXT:    # kill: def $dx killed $dx killed $edx
 ; AVX-32-NEXT:    # kill: def $cx killed $cx killed $ecx
@@ -61,12 +61,12 @@ define <3 x i16> @zext_i8(<3 x i8>) {
 define <3 x i16> @sext_i8(<3 x i8>) {
 ; SSE3-LABEL: sext_i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE3-NEXT:    shll $24, %ecx
-; SSE3-NEXT:    shll $8, %eax
-; SSE3-NEXT:    orl %ecx, %eax
-; SSE3-NEXT:    movd %eax, %xmm0
+; SSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE3-NEXT:    shll $24, %eax
+; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; SSE3-NEXT:    shll $8, %ecx
+; SSE3-NEXT:    orl %eax, %ecx
+; SSE3-NEXT:    movd %ecx, %xmm0
 ; SSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE3-NEXT:    shll $8, %eax
 ; SSE3-NEXT:    pinsrw $2, %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll
index a9611fcb55873..e9f7cde777ebb 100644
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+avx512fp16 -mattr=+avx512vl -o - | FileCheck %s
 
 ; This test checks that only a single je gets generated in the final code
@@ -6,6 +7,16 @@
 ; CHECK: je
 ; CHECK-NOT: je
 define <8 x half> @foo1(i32 %v1, <8 x half> %v2, <8 x half> %v3, <8 x half> %v4) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-NEXT:  .LBB0_2: # %entry
+; CHECK-NEXT:    vsubph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp eq i32 %v1, 0
   %t1 = select i1 %cmp, <8 x half> %v2, <8 x half> %v3
@@ -22,6 +33,40 @@ entry:
 ; CHECK: ja
 ; CHECK-NOT: ja
 define void @foo2(i32 %v1,
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    andl $-64, %esp
+; CHECK-NEXT:    subl $64, %esp
+; CHECK-NEXT:    vmovaps 72(%ebp), %zmm3
+; CHECK-NEXT:    cmpl $32, 8(%ebp)
+; CHECK-NEXT:    jae .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    vaddph 136(%ebp), %zmm3, %zmm3
+; CHECK-NEXT:  .LBB1_2: # %entry
+; CHECK-NEXT:    movl 200(%ebp), %eax
+; CHECK-NEXT:    vmovaps %zmm3, 52(%eax)
+; CHECK-NEXT:    jae .LBB1_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    vaddph 40(%ebp), %ymm2, %ymm2
+; CHECK-NEXT:  .LBB1_4: # %entry
+; CHECK-NEXT:    vmovaps %ymm2, 20(%eax)
+; CHECK-NEXT:    jae .LBB1_6
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    vaddph %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:  .LBB1_6: # %entry
+; CHECK-NEXT:    vmovaps %xmm0, 4(%eax)
+; CHECK-NEXT:    setae %cl
+; CHECK-NEXT:    kmovd %ecx, %k1
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vaddsh 16(%ebp), %xmm0, %xmm1
+; CHECK-NEXT:    vmovsh %xmm0, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovsh %xmm1, 2(%eax)
+; CHECK-NEXT:    movl %ebp, %esp
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
                   half %v32, half %v33,
                   <8 x half> %v52, <8 x half> %v53,
                   <16 x half> %v122, <16 x half> %v123,
@@ -63,8 +108,13 @@ entry:
 ; for lowering the CMOV pseudos that get created for this IR.
 define dso_local <32 x half> @foo3(<32 x half> %a, <32 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo3:
-; CHECK: jne
-; CHECK-NOT: jne
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:  .LBB2_2: # %entry
+; CHECK-NEXT:    retl
 entry:
   %spec.select = select i1 %sign, <32 x half> %a, <32 x half> %b
   ret <32 x half> %spec.select
@@ -74,8 +124,13 @@ entry:
 ; for lowering the CMOV pseudos that get created for this IR.
 define dso_local <16 x half> @foo4(<16 x half> %a, <16 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo4:
-; CHECK: jne
-; CHECK-NOT: jne
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:  .LBB3_2: # %entry
+; CHECK-NEXT:    retl
 entry:
   %spec.select = select i1 %sign, <16 x half> %a, <16 x half> %b
   ret <16 x half> %spec.select
@@ -85,8 +140,13 @@ entry:
 ; for lowering the CMOV pseudos that get created for this IR.
 define dso_local <8 x half> @foo5(<8 x half> %a, <8 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo5:
-; CHECK: jne
-; CHECK-NOT: jne
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jne .LBB4_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:  .LBB4_2: # %entry
+; CHECK-NEXT:    retl
 entry:
   %spec.select = select i1 %sign, <8 x half> %a, <8 x half> %b
   ret <8 x half> %spec.select
diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll
index 7d7d72e4c89f6..55335ed594fcd 100644
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s
 
 ; This test checks that only a single js gets generated in the final code
 ; for lowering the CMOV pseudos that get created for this IR.
@@ -6,6 +7,21 @@
 ; CHECK: js
 ; CHECK-NOT: js
 define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    js .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i32 %v1, 0
   %v2.v3 = select i1 %cmp, i32 %v2, i32 %v3
@@ -22,6 +38,20 @@ entry:
 ; CHECK-NOT: js
 ; CHECK-NOT: jns
 define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
+; CHECK-LABEL: foo11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    js .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp1 = icmp slt i32 %v1, 0
   %v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3
@@ -37,6 +67,23 @@ entry:
 ; CHECK: js
 ; CHECK-NOT: js
 define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    js .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:  .LBB2_2: # %entry
+; CHECK-NEXT:    movsbl %cl, %ecx
+; CHECK-NEXT:    js .LBB2_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:  .LBB2_4: # %entry
+; CHECK-NEXT:    movsbl %al, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i8 %v1, 0
   %v2.v3 = select i1 %cmp, i8 %v2, i8 %v3
@@ -53,6 +100,24 @@ entry:
 ; CHECK: js
 ; CHECK-NOT: js
 define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    testw %cx, %cx
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    js .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:  .LBB3_2: # %entry
+; CHECK-NEXT:    cwtl
+; CHECK-NEXT:    js .LBB3_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:  .LBB3_4: # %entry
+; CHECK-NEXT:    movswl %cx, %ecx
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i16 %v1, 0
   %v2.v3 = select i1 %cmp, i16 %v2, i16 %v3
@@ -69,6 +134,29 @@ entry:
 ; CHECK: js
 ; CHECK-NOT: js
 define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    js .LBB4_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB4_2: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    js .LBB4_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB4_4: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i32 %v1, 0
   %t1 = select i1 %cmp, float %v2, float %v3
@@ -83,6 +171,29 @@ entry:
 ; CHECK: je
 ; CHECK-NOT: je
 define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB5_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB5_2: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB5_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB5_4: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp eq i32 %v1, 0
   %t1 = select i1 %cmp, double %v2, double %v3
@@ -97,6 +208,91 @@ entry:
 ; CHECK: je
 ; CHECK-NOT: je
 define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB6_2: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB6_4: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    fstps 12(%eax)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB6_6
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB6_6: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB6_8
+; CHECK-NEXT:  # %bb.7: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB6_8: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    fstps 8(%eax)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB6_10
+; CHECK-NEXT:  # %bb.9: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB6_10: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB6_12
+; CHECK-NEXT:  # %bb.11: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB6_12: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    fstps 4(%eax)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB6_14
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB6_14: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB6_16
+; CHECK-NEXT:  # %bb.15: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB6_16: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    fstps (%eax)
+; CHECK-NEXT:    retl $4
 entry:
   %cmp = icmp eq i32 %v1, 0
   %t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3
@@ -111,6 +307,48 @@ entry:
 ; CHECK: je
 ; CHECK-NOT: je
 define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
+; CHECK-LABEL: foo7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB7_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB7_2: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB7_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB7_4: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(1)
+; CHECK-NEXT:    je .LBB7_6
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:  .LBB7_6: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB7_8
+; CHECK-NEXT:  # %bb.7: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:    fxch %st(2)
+; CHECK-NEXT:  .LBB7_8: # %entry
+; CHECK-NEXT:    fstp %st(2)
+; CHECK-NEXT:    fsubrp %st, %st(1)
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp eq i32 %v1, 0
   %t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3
@@ -127,6 +365,349 @@ entry:
 ; CHECK: ja
 ; CHECK-NOT: ja
 define void @foo8(i32 %v1,
+; CHECK-LABEL: foo8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorl $11, %eax
+; CHECK-NEXT:    movl $1234, %edx # imm = 0x4D2
+; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    cmpl $32, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_2: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    fstps 124(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_4: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 120(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_6
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_6: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 116(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_8
+; CHECK-NEXT:  # %bb.7: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_8: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 112(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_10
+; CHECK-NEXT:  # %bb.9: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_10: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 108(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_12
+; CHECK-NEXT:  # %bb.11: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_12: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 104(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_14
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_14: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 100(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_16
+; CHECK-NEXT:  # %bb.15: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_16: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 96(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_18
+; CHECK-NEXT:  # %bb.17: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_18: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 92(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_20
+; CHECK-NEXT:  # %bb.19: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_20: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 88(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_22
+; CHECK-NEXT:  # %bb.21: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_22: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 84(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_24
+; CHECK-NEXT:  # %bb.23: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_24: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 80(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_26
+; CHECK-NEXT:  # %bb.25: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_26: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 76(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_28
+; CHECK-NEXT:  # %bb.27: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_28: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 72(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_30
+; CHECK-NEXT:  # %bb.29: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_30: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 68(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrs {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_32
+; CHECK-NEXT:  # %bb.31: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_32: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 64(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    faddl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_34
+; CHECK-NEXT:  # %bb.33: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_34: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 56(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    faddl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_36
+; CHECK-NEXT:  # %bb.35: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_36: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 48(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fadds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_38
+; CHECK-NEXT:  # %bb.37: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_38: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 44(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fadds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_40
+; CHECK-NEXT:  # %bb.39: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_40: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 40(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fadds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_42
+; CHECK-NEXT:  # %bb.41: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_42: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 36(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fadds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_44
+; CHECK-NEXT:  # %bb.43: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_44: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 32(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_46
+; CHECK-NEXT:  # %bb.45: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_46: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 184(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_48
+; CHECK-NEXT:  # %bb.47: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_48: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 176(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_50
+; CHECK-NEXT:  # %bb.49: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_50: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 168(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_52
+; CHECK-NEXT:  # %bb.51: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_52: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 160(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_54
+; CHECK-NEXT:  # %bb.53: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_54: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 152(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_56
+; CHECK-NEXT:  # %bb.55: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_56: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 144(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_58
+; CHECK-NEXT:  # %bb.57: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_58: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 136(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fsubrl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_60
+; CHECK-NEXT:  # %bb.59: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_60: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 128(%ecx)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    faddl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_62
+; CHECK-NEXT:  # %bb.61: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_62: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstpl 16(%ecx)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fld %st(0)
+; CHECK-NEXT:    fadds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    jae .LBB8_64
+; CHECK-NEXT:  # %bb.63: # %entry
+; CHECK-NEXT:    fstp %st(1)
+; CHECK-NEXT:    fldz
+; CHECK-NEXT:  .LBB8_64: # %entry
+; CHECK-NEXT:    fstp %st(0)
+; CHECK-NEXT:    fstps 8(%ecx)
+; CHECK-NEXT:    jae .LBB8_65
+; CHECK-NEXT:  # %bb.66: # %entry
+; CHECK-NEXT:    movl %edx, 4(%ecx)
+; CHECK-NEXT:    jae .LBB8_67
+; CHECK-NEXT:  .LBB8_68: # %entry
+; CHECK-NEXT:    movw %ax, 2(%ecx)
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB8_65:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %edx, 4(%ecx)
+; CHECK-NEXT:    jb .LBB8_68
+; CHECK-NEXT:  .LBB8_67:
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movw %ax, 2(%ecx)
+; CHECK-NEXT:    retl
                   i8 %v2, i8 %v3,
                   i16 %v12, i16 %v13,
                   i32 %v22, i32 %v23,
@@ -216,6 +797,1364 @@ entry:
 ; CHECK: ja
 ; CHECK-NOT: ja
 define void @foo9(i32 %v1,
+; CHECK-LABEL: foo9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $144, %esp
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_2: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_4
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_4: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_6
+; CHECK-NEXT:  # %bb.5: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_6: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_8
+; CHECK-NEXT:  # %bb.7: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_8: # %entry
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_10
+; CHECK-NEXT:  # %bb.9: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_10: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_12
+; CHECK-NEXT:  # %bb.11: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_12: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_14
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_14: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_16
+; CHECK-NEXT:  # %bb.15: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_16: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_18
+; CHECK-NEXT:  # %bb.17: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_18: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_20
+; CHECK-NEXT:  # %bb.19: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_20: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_22
+; CHECK-NEXT:  # %bb.21: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_22: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_24
+; CHECK-NEXT:  # %bb.23: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_24: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_26
+; CHECK-NEXT:  # %bb.25: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_26: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_28
+; CHECK-NEXT:  # %bb.27: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_28: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    movl %ecx, %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_30
+; CHECK-NEXT:  # %bb.29: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_30: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_32
+; CHECK-NEXT:  # %bb.31: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_32: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_34
+; CHECK-NEXT:  # %bb.33: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_34: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_36
+; CHECK-NEXT:  # %bb.35: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_36: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_38
+; CHECK-NEXT:  # %bb.37: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_38: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_40
+; CHECK-NEXT:  # %bb.39: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_40: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_42
+; CHECK-NEXT:  # %bb.41: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_42: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_44
+; CHECK-NEXT:  # %bb.43: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_44: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_46
+; CHECK-NEXT:  # %bb.45: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_46: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_48
+; CHECK-NEXT:  # %bb.47: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_48: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_50
+; CHECK-NEXT:  # %bb.49: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_50: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_52
+; CHECK-NEXT:  # %bb.51: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_52: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_54
+; CHECK-NEXT:  # %bb.53: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_54: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_56
+; CHECK-NEXT:  # %bb.55: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_56: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_58
+; CHECK-NEXT:  # %bb.57: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_58: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_60
+; CHECK-NEXT:  # %bb.59: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_60: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_62
+; CHECK-NEXT:  # %bb.61: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_62: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_64
+; CHECK-NEXT:  # %bb.63: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_64: # %entry
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_66
+; CHECK-NEXT:  # %bb.65: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_66: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_68
+; CHECK-NEXT:  # %bb.67: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_68: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_70
+; CHECK-NEXT:  # %bb.69: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_70: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_72
+; CHECK-NEXT:  # %bb.71: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_72: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_74
+; CHECK-NEXT:  # %bb.73: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_74: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_76
+; CHECK-NEXT:  # %bb.75: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_76: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_78
+; CHECK-NEXT:  # %bb.77: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_78: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_80
+; CHECK-NEXT:  # %bb.79: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_80: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_82
+; CHECK-NEXT:  # %bb.81: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_82: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_84
+; CHECK-NEXT:  # %bb.83: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_84: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_86
+; CHECK-NEXT:  # %bb.85: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_86: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_88
+; CHECK-NEXT:  # %bb.87: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_88: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_90
+; CHECK-NEXT:  # %bb.89: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_90: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_92
+; CHECK-NEXT:  # %bb.91: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_92: # %entry
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_94
+; CHECK-NEXT:  # %bb.93: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_94: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_96
+; CHECK-NEXT:  # %bb.95: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_96: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_98
+; CHECK-NEXT:  # %bb.97: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_98: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_100
+; CHECK-NEXT:  # %bb.99: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_100: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_102
+; CHECK-NEXT:  # %bb.101: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_102: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_104
+; CHECK-NEXT:  # %bb.103: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_104: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_106
+; CHECK-NEXT:  # %bb.105: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_106: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_108
+; CHECK-NEXT:  # %bb.107: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_108: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_110
+; CHECK-NEXT:  # %bb.109: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_110: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_112
+; CHECK-NEXT:  # %bb.111: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_112: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_114
+; CHECK-NEXT:  # %bb.113: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_114: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_116
+; CHECK-NEXT:  # %bb.115: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_116: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_118
+; CHECK-NEXT:  # %bb.117: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_118: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_120
+; CHECK-NEXT:  # %bb.119: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_120: # %entry
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_122
+; CHECK-NEXT:  # %bb.121: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_122: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_124
+; CHECK-NEXT:  # %bb.123: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:  .LBB9_124: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_126
+; CHECK-NEXT:  # %bb.125: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_126: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_128
+; CHECK-NEXT:  # %bb.127: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_128: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_130
+; CHECK-NEXT:  # %bb.129: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_130: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_132
+; CHECK-NEXT:  # %bb.131: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_132: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_134
+; CHECK-NEXT:  # %bb.133: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_134: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_136
+; CHECK-NEXT:  # %bb.135: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_136: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_138
+; CHECK-NEXT:  # %bb.137: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:  .LBB9_138: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_140
+; CHECK-NEXT:  # %bb.139: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_140: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_142
+; CHECK-NEXT:  # %bb.141: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_142: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_144
+; CHECK-NEXT:  # %bb.143: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_144: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_146
+; CHECK-NEXT:  # %bb.145: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_146: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_148
+; CHECK-NEXT:  # %bb.147: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_148: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_150
+; CHECK-NEXT:  # %bb.149: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_150: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_152
+; CHECK-NEXT:  # %bb.151: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_152: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_154
+; CHECK-NEXT:  # %bb.153: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_154: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_156
+; CHECK-NEXT:  # %bb.155: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_156: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_158
+; CHECK-NEXT:  # %bb.157: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_158: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_160
+; CHECK-NEXT:  # %bb.159: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_160: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_162
+; CHECK-NEXT:  # %bb.161: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_162: # %entry
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_164
+; CHECK-NEXT:  # %bb.163: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_164: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_166
+; CHECK-NEXT:  # %bb.165: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_166: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_168
+; CHECK-NEXT:  # %bb.167: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_168: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_170
+; CHECK-NEXT:  # %bb.169: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_170: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_172
+; CHECK-NEXT:  # %bb.171: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_172: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_174
+; CHECK-NEXT:  # %bb.173: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_174: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_176
+; CHECK-NEXT:  # %bb.175: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_176: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_178
+; CHECK-NEXT:  # %bb.177: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_178: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_180
+; CHECK-NEXT:  # %bb.179: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_180: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_182
+; CHECK-NEXT:  # %bb.181: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_182: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_184
+; CHECK-NEXT:  # %bb.183: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_184: # %entry
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_186
+; CHECK-NEXT:  # %bb.185: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_186: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_188
+; CHECK-NEXT:  # %bb.187: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_188: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_190
+; CHECK-NEXT:  # %bb.189: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_190: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_192
+; CHECK-NEXT:  # %bb.191: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_192: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_194
+; CHECK-NEXT:  # %bb.193: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_194: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_196
+; CHECK-NEXT:  # %bb.195: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_196: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_198
+; CHECK-NEXT:  # %bb.197: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:  .LBB9_198: # %entry
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_200
+; CHECK-NEXT:  # %bb.199: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_200: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_202
+; CHECK-NEXT:  # %bb.201: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:  .LBB9_202: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_204
+; CHECK-NEXT:  # %bb.203: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_204: # %entry
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_206
+; CHECK-NEXT:  # %bb.205: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_206: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_208
+; CHECK-NEXT:  # %bb.207: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_208: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_210
+; CHECK-NEXT:  # %bb.209: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_210: # %entry
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_212
+; CHECK-NEXT:  # %bb.211: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ah
+; CHECK-NEXT:  .LBB9_212: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_214
+; CHECK-NEXT:  # %bb.213: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_214: # %entry
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_216
+; CHECK-NEXT:  # %bb.215: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:  .LBB9_216: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_218
+; CHECK-NEXT:  # %bb.217: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:  .LBB9_218: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_220
+; CHECK-NEXT:  # %bb.219: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bh
+; CHECK-NEXT:  .LBB9_220: # %entry
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_222
+; CHECK-NEXT:  # %bb.221: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_222: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    movb %al, %bh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_224
+; CHECK-NEXT:  # %bb.223: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:  .LBB9_224: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_226
+; CHECK-NEXT:  # %bb.225: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_226: # %entry
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_228
+; CHECK-NEXT:  # %bb.227: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_228: # %entry
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_230
+; CHECK-NEXT:  # %bb.229: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:  .LBB9_230: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_232
+; CHECK-NEXT:  # %bb.231: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_232: # %entry
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:    movb %dh, %ah
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_234
+; CHECK-NEXT:  # %bb.233: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %bl
+; CHECK-NEXT:  .LBB9_234: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    jae .LBB9_236
+; CHECK-NEXT:  # %bb.235: # %entry
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:  .LBB9_236: # %entry
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_237
+; CHECK-NEXT:  # %bb.238: # %entry
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dh
+; CHECK-NEXT:    jmp .LBB9_239
+; CHECK-NEXT:  .LBB9_237:
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:  .LBB9_239: # %entry
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    cmpl $32, %esi
+; CHECK-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jae .LBB9_240
+; CHECK-NEXT:  # %bb.241: # %entry
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    xorb {{[0-9]+}}(%esp), %dl
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    jmp .LBB9_242
+; CHECK-NEXT:  .LBB9_240:
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:  .LBB9_242: # %entry
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %bl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %bl, %dl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Reload
+; CHECK-NEXT:    addb %bh, %bh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Reload
+; CHECK-NEXT:    andb $1, %bl
+; CHECK-NEXT:    orb %bh, %bl
+; CHECK-NEXT:    andb $3, %bl
+; CHECK-NEXT:    orb %dl, %bl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
+; CHECK-NEXT:    shlb $3, %dh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %dh, %dl
+; CHECK-NEXT:    addb %ch, %ch
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    orb %ch, %ah
+; CHECK-NEXT:    andb $3, %ah
+; CHECK-NEXT:    orb %dl, %ah
+; CHECK-NEXT:    shlb $4, %bl
+; CHECK-NEXT:    andb $15, %ah
+; CHECK-NEXT:    orb %bl, %ah
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movb %ah, (%ebp)
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %al, %dl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %cl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    andb $3, %ah
+; CHECK-NEXT:    orb %dl, %ah
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movzbl %ah, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %dl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %cl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    andb $3, %ah
+; CHECK-NEXT:    orb %dl, %ah
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movzbl %ah, %edi
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %dl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %cl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    andb $3, %ah
+; CHECK-NEXT:    orb %dl, %ah
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movzbl %ah, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %dl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    shlb $2, %dl
+; CHECK-NEXT:    orb %cl, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    andb $3, %ah
+; CHECK-NEXT:    orb %dl, %ah
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movzbl %ah, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %esi
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl %ch, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    shlb $2, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %cl, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %al, %ch
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    addb %al, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %dl
+; CHECK-NEXT:    orb %al, %dl
+; CHECK-NEXT:    andb $3, %dl
+; CHECK-NEXT:    orb %ah, %dl
+; CHECK-NEXT:    shlb $4, %ch
+; CHECK-NEXT:    andb $15, %dl
+; CHECK-NEXT:    orb %ch, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    addb %ch, %ch
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %ch, %al
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    shlb $3, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    shlb $2, %cl
+; CHECK-NEXT:    orb %ah, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    addb %ah, %ah
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %bl
+; CHECK-NEXT:    orb %ah, %bl
+; CHECK-NEXT:    andb $3, %bl
+; CHECK-NEXT:    orb %cl, %bl
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    andb $15, %bl
+; CHECK-NEXT:    orb %al, %bl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    addb %al, %al
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    andb $3, %cl
+; CHECK-NEXT:    orb %ah, %cl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    shlb $3, %al
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %al, %ah
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
+; CHECK-NEXT:    addb %dh, %dh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ch
+; CHECK-NEXT:    orb %dh, %ch
+; CHECK-NEXT:    andb $3, %ch
+; CHECK-NEXT:    orb %ah, %ch
+; CHECK-NEXT:    shlb $4, %cl
+; CHECK-NEXT:    andb $15, %ch
+; CHECK-NEXT:    orb %cl, %ch
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    shlb $3, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %cl, %ah
+; CHECK-NEXT:    shll $8, %edi
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT:    movw %di, 4(%ebp)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT:    shll $8, %edi
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT:    shll $8, %esi
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT:    shll $16, %edi
+; CHECK-NEXT:    movzwl %si, %esi
+; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    movl %esi, 20(%ebp)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; CHECK-NEXT:    shll $8, %ebp
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT:    shll $8, %esi
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
+; CHECK-NEXT:    addb %dh, %dh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    orb %dh, %cl
+; CHECK-NEXT:    andb $3, %cl
+; CHECK-NEXT:    orb %ah, %cl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
+; CHECK-NEXT:    shlb $3, %dh
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ah # 1-byte Reload
+; CHECK-NEXT:    andb $1, %ah
+; CHECK-NEXT:    shlb $2, %ah
+; CHECK-NEXT:    orb %dh, %ah
+; CHECK-NEXT:    movzbl %dl, %esi
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; CHECK-NEXT:    addb %dl, %dl
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    movzbl %bl, %edx
+; CHECK-NEXT:    andb $3, %al
+; CHECK-NEXT:    orb %ah, %al
+; CHECK-NEXT:    movzbl %ch, %edi
+; CHECK-NEXT:    shlb $4, %cl
+; CHECK-NEXT:    andb $15, %al
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    movzbl %al, %ecx
+; CHECK-NEXT:    shll $16, %ebp
+; CHECK-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT:    orl %ebp, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl %eax, 16(%ebx)
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %esi, %edx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    movzwl %cx, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    movl %eax, 8(%ebx)
+; CHECK-NEXT:    addl $144, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
                   <8 x i1> %v12, <8 x i1> %v13,
                   <16 x i1> %v22, <16 x i1> %v23,
                   <32 x i1> %v32, <32 x i1> %v33,
diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll
index 4ce131bb86454..eaa32c7e9fba1 100644
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s 
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2 -o - | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
 
 ; This test checks that only a single jae gets generated in the final code
 ; for lowering the CMOV pseudos that get created for this IR.
@@ -37,3 +38,5 @@ entry:
   ret float %d6
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
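
The autogenerated NOTE/CHECK lines above come from LLVM's utils/update_llc_test_checks.py. As a minimal sketch, assuming llc has been built under ./build (the binary path below is an assumption about the local build layout, not part of this patch), the assertions for this one test can be regenerated with:

  # build/bin/llc is an assumed path to the locally built llc
  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
    llvm/test/CodeGen/X86/pseudo_cmov_lower1.ll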
diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
index 0a00a72a2dc94..e4fde8ee7f17c 100644
--- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
@@ -15,9 +15,9 @@ define i32 @and_signbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movzbl 6(%esp), %eax
 ; X86-NEXT:    shll $24, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -36,9 +36,9 @@ define i32 @and_nosignbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movzbl 6(%esp), %eax
 ; X86-NEXT:    shll $24, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -58,10 +58,10 @@ define i32 @or_signbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -80,10 +80,10 @@ define i32 @or_nosignbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -103,10 +103,10 @@ define i32 @xor_signbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -125,10 +125,10 @@ define i32 @xor_nosignbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -148,10 +148,10 @@ define i32 @add_signbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    addl $-16777216, %eax # imm = 0xFF000000
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -170,10 +170,10 @@ define i32 @add_nosignbit_shl(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl 4(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    addl $-16777216, %eax # imm = 0xFF000000
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
@@ -195,9 +195,9 @@ define i32 @and_signbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movzwl 6(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -216,10 +216,10 @@ define i32 @and_nosignbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    andl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -239,10 +239,10 @@ define i32 @or_signbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    orl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -261,10 +261,10 @@ define i32 @or_nosignbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    orl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -284,10 +284,10 @@ define i32 @xor_signbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -306,10 +306,10 @@ define i32 @xor_nosignbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -329,10 +329,10 @@ define i32 @add_signbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    addl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -351,10 +351,10 @@ define i32 @add_nosignbit_lshr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    addl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
@@ -376,9 +376,9 @@ define i32 @and_signbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movswl 6(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -397,10 +397,10 @@ define i32 @and_nosignbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    andl 4(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -420,10 +420,10 @@ define i32 @or_signbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    orl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -442,10 +442,10 @@ define i32 @or_nosignbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    orl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -465,10 +465,10 @@ define i32 @xor_signbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -487,10 +487,10 @@ define i32 @xor_nosignbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    xorl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -510,10 +510,10 @@ define i32 @add_signbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    addl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -532,10 +532,10 @@ define i32 @add_nosignbit_ashr(i32 %x, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    addl 4(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl 8(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
diff --git a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll
index 8c858e04de2a1..1c00d25b2134c 100644
--- a/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll
+++ b/llvm/test/CodeGen/X86/pull-conditional-binop-through-shift.ll
@@ -17,7 +17,6 @@ define i32 @and_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB0_2
@@ -25,6 +24,7 @@ define i32 @and_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:  .LBB0_2:
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -46,7 +46,6 @@ define i32 @and_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_2
@@ -54,6 +53,7 @@ define i32 @and_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:  .LBB1_2:
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -76,13 +76,13 @@ define i32 @or_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -104,13 +104,13 @@ define i32 @or_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -133,13 +133,13 @@ define i32 @xor_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -161,13 +161,13 @@ define i32 @xor_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -190,13 +190,13 @@ define i32 @add_signbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -218,13 +218,13 @@ define i32 @add_nosignbit_select_shl(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_select_shl:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
@@ -249,7 +249,6 @@ define i32 @and_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB8_2
@@ -257,6 +256,7 @@ define i32 @and_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:  .LBB8_2:
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -278,7 +278,6 @@ define i32 @and_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB9_2
@@ -286,6 +285,7 @@ define i32 @and_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:  .LBB9_2:
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -308,13 +308,13 @@ define i32 @or_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -336,13 +336,13 @@ define i32 @or_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -365,13 +365,13 @@ define i32 @xor_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -393,13 +393,13 @@ define i32 @xor_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -422,13 +422,13 @@ define i32 @add_signbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -450,13 +450,13 @@ define i32 @add_nosignbit_select_lshr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_select_lshr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
@@ -481,7 +481,6 @@ define i32 @and_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_signbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB16_2
@@ -489,6 +488,7 @@ define i32 @and_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:  .LBB16_2:
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 4294901760 ; 0xFFFF0000
@@ -510,7 +510,6 @@ define i32 @and_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: and_nosignbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB17_2
@@ -518,6 +517,7 @@ define i32 @and_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:  .LBB17_2:
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, 2147418112 ; 0x7FFF0000
@@ -540,13 +540,13 @@ define i32 @or_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_signbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 4294901760 ; 0xFFFF0000
@@ -568,13 +568,13 @@ define i32 @or_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: or_nosignbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = or i32 %x, 2147418112 ; 0x7FFF0000
@@ -597,13 +597,13 @@ define i32 @xor_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_signbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 4294901760 ; 0xFFFF0000
@@ -625,13 +625,13 @@ define i32 @xor_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: xor_nosignbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = xor i32 %x, 2147418112 ; 0x7FFF0000
@@ -654,13 +654,13 @@ define i32 @add_signbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_signbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 4294901760 ; 0xFFFF0000
@@ -682,13 +682,13 @@ define i32 @add_nosignbit_select_ashr(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: add_nosignbit_select_ashr:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    andl $2147418112, %eax # imm = 0x7FFF0000
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $8, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = add i32 %x, 2147418112 ; 0x7FFF0000
@@ -713,13 +713,13 @@ define i32 @shl_signbit_select_add(i32 %x, i1 %cond, ptr %dst) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    negb %cl
 ; X86-NEXT:    andb $4, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = shl i32 %x, 4
   %t1 = select i1 %cond, i32 %t0, i32 %x
@@ -741,7 +741,6 @@ define i32 @shl_signbit_select_add_fail(i32 %x, i1 %cond, ptr %dst) {
 ;
 ; X86-LABEL: shl_signbit_select_add_fail:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB25_2
@@ -749,6 +748,7 @@ define i32 @shl_signbit_select_add_fail(i32 %x, i1 %cond, ptr %dst) {
 ; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:  .LBB25_2:
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = shl i32 %x, 4
@@ -774,13 +774,13 @@ define i32 @lshr_signbit_select_add(i32 %x, i1 %cond, ptr %dst, i32 %y) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    negb %cl
 ; X86-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = lshr i32 %x, %y
   %t1 = select i1 %cond, i32 %t0, i32 %x
@@ -804,13 +804,13 @@ define i32 @ashr_signbit_select_add(i32 %x, i1 %cond, ptr %dst) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    negb %cl
 ; X86-NEXT:    andb $4, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = ashr i32 %x, 4
   %t1 = select i1 %cond, i32 %t0, i32 %x
@@ -832,13 +832,13 @@ define i32 @and_signbit_select_add(i32 %x, i1 %cond, ptr %dst, i32 %y) {
 ;
 ; X86-LABEL: and_signbit_select_add:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, %y
@@ -862,7 +862,6 @@ define i32 @and_signbit_select_add_fail(i32 %x, i1 %cond, ptr %dst, i32 %y) {
 ;
 ; X86-LABEL: and_signbit_select_add_fail:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB29_2
@@ -870,6 +869,7 @@ define i32 @and_signbit_select_add_fail(i32 %x, i1 %cond, ptr %dst, i32 %y) {
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB29_2:
 ; X86-NEXT:    addl $123456, %eax # imm = 0x1E240
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
   %t0 = and i32 %x, %y
diff --git a/llvm/test/CodeGen/X86/rdrand.ll b/llvm/test/CodeGen/X86/rdrand.ll
index 52f767a4a419c..1b138378af83c 100644
--- a/llvm/test/CodeGen/X86/rdrand.ll
+++ b/llvm/test/CodeGen/X86/rdrand.ll
@@ -8,12 +8,12 @@ declare {i32, i32} @llvm.x86.rdrand.32()
 define i32 @_rdrand16_step(ptr %random_val) {
 ; X86-LABEL: _rdrand16_step:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    rdrandw %ax
-; X86-NEXT:    movzwl %ax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rdrandw %cx
+; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movzwl %cx, %ecx
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    cmovael %edx, %eax
-; X86-NEXT:    movw %dx, (%ecx)
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: _rdrand16_step:
@@ -34,11 +34,11 @@ define i32 @_rdrand16_step(ptr %random_val) {
 define i32 @_rdrand32_step(ptr %random_val) {
 ; X86-LABEL: _rdrand32_step:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    rdrandl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rdrandl %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    cmovael %edx, %eax
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: _rdrand32_step:
diff --git a/llvm/test/CodeGen/X86/rdseed.ll b/llvm/test/CodeGen/X86/rdseed.ll
index 5cdd84233cb0f..6ebd28d4549a0 100644
--- a/llvm/test/CodeGen/X86/rdseed.ll
+++ b/llvm/test/CodeGen/X86/rdseed.ll
@@ -8,12 +8,12 @@ declare {i32, i32} @llvm.x86.rdseed.32()
 define i32 @_rdseed16_step(ptr %random_val) {
 ; X86-LABEL: _rdseed16_step:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    rdseedw %ax
-; X86-NEXT:    movzwl %ax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rdseedw %cx
+; X86-NEXT:    movw %cx, (%eax)
+; X86-NEXT:    movzwl %cx, %ecx
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    cmovael %edx, %eax
-; X86-NEXT:    movw %dx, (%ecx)
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: _rdseed16_step:
@@ -34,11 +34,11 @@ define i32 @_rdseed16_step(ptr %random_val) {
 define i32 @_rdseed32_step(ptr %random_val) {
 ; X86-LABEL: _rdseed32_step:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    rdseedl %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rdseedl %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $1, %eax
-; X86-NEXT:    cmovael %edx, %eax
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: _rdseed32_step:
diff --git a/llvm/test/CodeGen/X86/rdtsc-upgrade.ll b/llvm/test/CodeGen/X86/rdtsc-upgrade.ll
index adf46bd537576..be5025c5854af 100644
--- a/llvm/test/CodeGen/X86/rdtsc-upgrade.ll
+++ b/llvm/test/CodeGen/X86/rdtsc-upgrade.ll
@@ -10,8 +10,8 @@ define i64 @test_builtin_rdtscp(ptr %A) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    rdtscp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, (%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/rdtsc.ll b/llvm/test/CodeGen/X86/rdtsc.ll
index 1efdfd126e3ea..23d00f900a3bc 100644
--- a/llvm/test/CodeGen/X86/rdtsc.ll
+++ b/llvm/test/CodeGen/X86/rdtsc.ll
@@ -46,8 +46,8 @@ define i64 @test_builtin_rdtscp(ptr %A) {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    rdtscp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, (%esi)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
index ea1ca51908134..8f0ca31024dcc 100644
--- a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
+++ b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686 -regalloc=greedy --debug-only=regalloc 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
@@ -15,18 +16,60 @@
 
 ; Make sure the split behaves as expected
 ; CHECK: RS_Split Cascade 1
-; CHECK-NOT: $eax	static = 
+; CHECK-NOT: $eax	static =
 ; CHECK: $eax	no positive bundles
 ; CHECK-NEXT: $ecx	no positive bundles
 ; CHECK-NEXT: $edx	no positive bundles
-; CHECK-NEXT: $esi	static = 
+; CHECK-NEXT: $esi	static =
 ; CHECK-NEXT: $edi	no positive bundles
 ; CHECK-NEXT: $ebx	no positive bundles
-; CHECK-NEXT: $ebp	static = 
+; CHECK-NEXT: $ebp	static =
 ; CHECK: Split for $ebp
 
 ; Function Attrs: nounwind
 define i32 @foo(ptr %array, i32 %cond1, i32 %val) local_unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl (%esi,%eax,4), %ebp
+; CHECK-NEXT:    shll $5, %ebp
+; CHECK-NEXT:    movl 16(%esi), %edi
+; CHECK-NEXT:    movl 12(%esi), %edx
+; CHECK-NEXT:    movl 8(%esi), %ecx
+; CHECK-NEXT:    movl (%esi), %eax
+; CHECK-NEXT:    movl 4(%esi), %ebx
+; CHECK-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %if.then
+; CHECK-NEXT:    movl %ebp, 24(%esi)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    leal 28(%eax), %ecx
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # %if.else
+; CHECK-NEXT:    movl %ebp, 32(%esi)
+; CHECK-NEXT:    movl 20(%esi), %esi
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    leal 36(%eax), %ecx
+; CHECK-NEXT:  .LBB0_3: # %if.end
+; CHECK-NEXT:    movl %ebp, (%ecx)
+; CHECK-NEXT:    addl (%eax), %ebp
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %array.addr = alloca ptr, align 4
   store ptr %array, ptr %array.addr, align 4, !tbaa !3
diff --git a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
index a0bd35d5d219b..8d5357e4b526a 100644
--- a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -91,10 +91,10 @@ define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
 ;
 ; X86-LABEL: f64_bzhi:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl fill_table64+4(,%eax,8), %edx
-; X86-NEXT:    movl fill_table64(,%eax,8), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl fill_table64(,%ecx,8), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl fill_table64+4(,%ecx,8), %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 entry:
@@ -133,10 +133,10 @@ define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
 ;
 ; X86-LABEL: f64_bzhi_partial:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl fill_table64_partial+4(,%eax,8), %edx
-; X86-NEXT:    movl fill_table64_partial(,%eax,8), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl fill_table64_partial(,%ecx,8), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl fill_table64_partial+4(,%ecx,8), %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/retpoline-external.ll b/llvm/test/CodeGen/X86/retpoline-external.ll
index 8c068bb474782..740a64ba085d0 100644
--- a/llvm/test/CodeGen/X86/retpoline-external.ll
+++ b/llvm/test/CodeGen/X86/retpoline-external.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
@@ -8,6 +9,122 @@ declare dso_local void @bar(i32)
 
 ; Test a simple indirect call and tail call.
 define void @icall_reg(ptr %fp, i32 %x) #0 {
+; X64-LABEL: icall_reg:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -24
+; X64-NEXT:    .cfi_offset %r14, -16
+; X64-NEXT:    movl %esi, %ebx
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movl %esi, %edi
+; X64-NEXT:    callq bar
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    callq __x86_indirect_thunk_r11
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq bar
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X64FAST-LABEL: icall_reg:
+; X64FAST:       # %bb.0: # %entry
+; X64FAST-NEXT:    subq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 32
+; X64FAST-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    movq %rdi, %rax
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    callq bar
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    callq __x86_indirect_thunk_r11
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    callq bar
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    addq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X86-LABEL: icall_reg:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll bar
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __x86_indirect_thunk_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll bar
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __x86_indirect_thunk_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X86FAST-LABEL: icall_reg:
+; X86FAST:       # %bb.0: # %entry
+; X86FAST-NEXT:    subl $12, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 16
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86FAST-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl %esp, %eax
+; X86FAST-NEXT:    movl %ecx, (%eax)
+; X86FAST-NEXT:    calll bar
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __x86_indirect_thunk_eax
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %eax
+; X86FAST-NEXT:    movl %ecx, (%eax)
+; X86FAST-NEXT:    calll bar
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __x86_indirect_thunk_eax
+; X86FAST-NEXT:    addl $12, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    retl
 entry:
   tail call void @bar(i32 %x)
   tail call void %fp(i32 %x)
@@ -62,6 +179,58 @@ entry:
 
 ; Test an indirect call through a global variable.
 define void @icall_global_fp(i32 %x, ptr %fpp) #0 {
+; X64-LABEL: icall_global_fp:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    .cfi_offset %rbx, -16
+; X64-NEXT:    movl %edi, %ebx
+; X64-NEXT:    movq global_fp(%rip), %r11
+; X64-NEXT:    callq __x86_indirect_thunk_r11
+; X64-NEXT:    movq global_fp(%rip), %r11
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X64FAST-LABEL: icall_global_fp:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    pushq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 16
+; X64FAST-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    movq global_fp(%rip), %r11
+; X64FAST-NEXT:    callq __x86_indirect_thunk_r11
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq global_fp(%rip), %r11
+; X64FAST-NEXT:    popq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X86-LABEL: icall_global_fp:
+; X86:       # %bb.0:
+; X86-NEXT:    movl global_fp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __x86_indirect_thunk_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl global_fp, %eax
+; X86-NEXT:    jmp __x86_indirect_thunk_eax # TAILCALL
+;
+; X86FAST-LABEL: icall_global_fp:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    pushl %eax
+; X86FAST-NEXT:    .cfi_def_cfa_offset 8
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86FAST-NEXT:    movl global_fp, %eax
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __x86_indirect_thunk_eax
+; X86FAST-NEXT:    movl global_fp, %eax
+; X86FAST-NEXT:    popl %ecx
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    jmp __x86_indirect_thunk_eax # TAILCALL
   %fp1 = load ptr, ptr @global_fp
   call void %fp1(i32 %x)
   %fp2 = load ptr, ptr @global_fp
@@ -100,6 +269,80 @@ define void @icall_global_fp(i32 %x, ptr %fpp) #0 {
 
 ; Test an indirect call through a vtable.
 define void @vcall(ptr %obj) #0 {
+; X64-LABEL: vcall:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -24
+; X64-NEXT:    .cfi_offset %r14, -16
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq 8(%rax), %r14
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    callq __x86_indirect_thunk_r11
+; X64-NEXT:    movq %rbx, %rdi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X64FAST-LABEL: vcall:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    subq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 32
+; X64FAST-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq (%rdi), %rax
+; X64FAST-NEXT:    movq 8(%rax), %r11
+; X64FAST-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    callq __x86_indirect_thunk_r11
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    addq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __x86_indirect_thunk_r11 # TAILCALL
+;
+; X86-LABEL: vcall:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __x86_indirect_thunk_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    jmp __x86_indirect_thunk_eax # TAILCALL
+;
+; X86FAST-LABEL: vcall:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    subl $8, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 12
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86FAST-NEXT:    movl (%edx), %eax
+; X86FAST-NEXT:    movl 4(%eax), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __x86_indirect_thunk_eax
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    addl $8, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    jmp __x86_indirect_thunk_eax # TAILCALL
   %vptr = load ptr, ptr %obj
   %vslot = getelementptr ptr, ptr %vptr, i32 1
   %fp = load ptr, ptr %vslot
@@ -141,6 +384,21 @@ define void @vcall(ptr %obj) #0 {
 declare dso_local void @direct_callee()
 
 define void @direct_tail() #0 {
+; X64-LABEL: direct_tail:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp direct_callee # TAILCALL
+;
+; X64FAST-LABEL: direct_tail:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    jmp direct_callee # TAILCALL
+;
+; X86-LABEL: direct_tail:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp direct_callee # TAILCALL
+;
+; X86FAST-LABEL: direct_tail:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    jmp direct_callee # TAILCALL
   tail call void @direct_callee()
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/retpoline.ll b/llvm/test/CodeGen/X86/retpoline.ll
index ea9c9bc4fae22..938db8ecc785c 100644
--- a/llvm/test/CodeGen/X86/retpoline.ll
+++ b/llvm/test/CodeGen/X86/retpoline.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
 ; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
@@ -8,6 +9,122 @@ declare void @bar(i32)
 
 ; Test a simple indirect call and tail call.
 define void @icall_reg(ptr %fp, i32 %x) #0 {
+; X64-LABEL: icall_reg:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -24
+; X64-NEXT:    .cfi_offset %r14, -16
+; X64-NEXT:    movl %esi, %ebx
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movl %esi, %edi
+; X64-NEXT:    callq bar@PLT
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    callq __llvm_retpoline_r11
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    callq bar@PLT
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X64FAST-LABEL: icall_reg:
+; X64FAST:       # %bb.0: # %entry
+; X64FAST-NEXT:    subq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 32
+; X64FAST-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    movq %rdi, %rax
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    callq bar@PLT
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    callq __llvm_retpoline_r11
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    callq bar@PLT
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    addq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X86-LABEL: icall_reg:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll bar@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __llvm_retpoline_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll bar@PLT
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __llvm_retpoline_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X86FAST-LABEL: icall_reg:
+; X86FAST:       # %bb.0: # %entry
+; X86FAST-NEXT:    subl $12, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 16
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86FAST-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl %esp, %eax
+; X86FAST-NEXT:    movl %ecx, (%eax)
+; X86FAST-NEXT:    calll bar@PLT
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __llvm_retpoline_eax
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %eax
+; X86FAST-NEXT:    movl %ecx, (%eax)
+; X86FAST-NEXT:    calll bar@PLT
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __llvm_retpoline_eax
+; X86FAST-NEXT:    addl $12, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    retl
 entry:
   tail call void @bar(i32 %x)
   tail call void %fp(i32 %x)
@@ -62,6 +179,58 @@ entry:
 
 ; Test an indirect call through a global variable.
 define void @icall_global_fp(i32 %x, ptr %fpp) #0 {
+; X64-LABEL: icall_global_fp:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    .cfi_offset %rbx, -16
+; X64-NEXT:    movl %edi, %ebx
+; X64-NEXT:    movq global_fp(%rip), %r11
+; X64-NEXT:    callq __llvm_retpoline_r11
+; X64-NEXT:    movq global_fp(%rip), %r11
+; X64-NEXT:    movl %ebx, %edi
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X64FAST-LABEL: icall_global_fp:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    pushq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 16
+; X64FAST-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64FAST-NEXT:    movq global_fp(%rip), %r11
+; X64FAST-NEXT:    callq __llvm_retpoline_r11
+; X64FAST-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; X64FAST-NEXT:    movq global_fp(%rip), %r11
+; X64FAST-NEXT:    popq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X86-LABEL: icall_global_fp:
+; X86:       # %bb.0:
+; X86-NEXT:    movl global_fp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __llvm_retpoline_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl global_fp, %eax
+; X86-NEXT:    jmp __llvm_retpoline_eax # TAILCALL
+;
+; X86FAST-LABEL: icall_global_fp:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    pushl %eax
+; X86FAST-NEXT:    .cfi_def_cfa_offset 8
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86FAST-NEXT:    movl global_fp, %eax
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __llvm_retpoline_eax
+; X86FAST-NEXT:    movl global_fp, %eax
+; X86FAST-NEXT:    popl %ecx
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    jmp __llvm_retpoline_eax # TAILCALL
   %fp1 = load ptr, ptr @global_fp
   call void %fp1(i32 %x)
   %fp2 = load ptr, ptr @global_fp
@@ -100,6 +269,80 @@ define void @icall_global_fp(i32 %x, ptr %fpp) #0 {
 
 ; Test an indirect call through a vtable.
 define void @vcall(ptr %obj) #0 {
+; X64-LABEL: vcall:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -24
+; X64-NEXT:    .cfi_offset %r14, -16
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq 8(%rax), %r14
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    callq __llvm_retpoline_r11
+; X64-NEXT:    movq %rbx, %rdi
+; X64-NEXT:    movq %r14, %r11
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    popq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X64FAST-LABEL: vcall:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    subq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 32
+; X64FAST-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq (%rdi), %rax
+; X64FAST-NEXT:    movq 8(%rax), %r11
+; X64FAST-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    callq __llvm_retpoline_r11
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64FAST-NEXT:    addq $24, %rsp
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X86-LABEL: vcall:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    calll __llvm_retpoline_eax
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_adjust_cfa_offset -4
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    jmp __llvm_retpoline_eax # TAILCALL
+;
+; X86FAST-LABEL: vcall:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    subl $8, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 12
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86FAST-NEXT:    movl (%edx), %eax
+; X86FAST-NEXT:    movl 4(%eax), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl %esp, %ecx
+; X86FAST-NEXT:    movl %edx, (%ecx)
+; X86FAST-NEXT:    calll __llvm_retpoline_eax
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    addl $8, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 4
+; X86FAST-NEXT:    jmp __llvm_retpoline_eax # TAILCALL
   %vptr = load ptr, ptr %obj
   %vslot = getelementptr ptr, ptr %vptr, i32 1
   %fp = load ptr, ptr %vslot
@@ -141,6 +384,21 @@ define void @vcall(ptr %obj) #0 {
 declare void @direct_callee()
 
 define void @direct_tail() #0 {
+; X64-LABEL: direct_tail:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp direct_callee@PLT # TAILCALL
+;
+; X64FAST-LABEL: direct_tail:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    jmp direct_callee@PLT # TAILCALL
+;
+; X86-LABEL: direct_tail:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp direct_callee@PLT # TAILCALL
+;
+; X86FAST-LABEL: direct_tail:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    jmp direct_callee@PLT # TAILCALL
   tail call void @direct_callee()
   ret void
 }
@@ -158,6 +416,39 @@ define void @direct_tail() #0 {
 declare void @nonlazybind_callee() #2
 
 define void @nonlazybind_caller() #0 {
+; X64-LABEL: nonlazybind_caller:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    .cfi_offset %rbx, -16
+; X64-NEXT:    movq nonlazybind_callee@GOTPCREL(%rip), %rbx
+; X64-NEXT:    movq %rbx, %r11
+; X64-NEXT:    callq __llvm_retpoline_r11
+; X64-NEXT:    movq %rbx, %r11
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X64FAST-LABEL: nonlazybind_caller:
+; X64FAST:       # %bb.0:
+; X64FAST-NEXT:    pushq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 16
+; X64FAST-NEXT:    movq nonlazybind_callee@GOTPCREL(%rip), %r11
+; X64FAST-NEXT:    callq __llvm_retpoline_r11
+; X64FAST-NEXT:    movq nonlazybind_callee@GOTPCREL(%rip), %r11
+; X64FAST-NEXT:    popq %rax
+; X64FAST-NEXT:    .cfi_def_cfa_offset 8
+; X64FAST-NEXT:    jmp __llvm_retpoline_r11 # TAILCALL
+;
+; X86-LABEL: nonlazybind_caller:
+; X86:       # %bb.0:
+; X86-NEXT:    calll nonlazybind_callee@PLT
+; X86-NEXT:    jmp nonlazybind_callee@PLT # TAILCALL
+;
+; X86FAST-LABEL: nonlazybind_caller:
+; X86FAST:       # %bb.0:
+; X86FAST-NEXT:    calll nonlazybind_callee@PLT
+; X86FAST-NEXT:    jmp nonlazybind_callee@PLT # TAILCALL
   call void @nonlazybind_callee()
   tail call void @nonlazybind_callee()
   ret void
@@ -186,9 +477,275 @@ define void @nonlazybind_caller() #0 {
 ; enabled for calls.
 define void @switch_jumptable(ptr %ptr, ptr %sink) #0 {
 ; X64-LABEL: switch_jumptable:
-; X64:         jmpq *
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_8: # %bb5
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $5, (%rsi)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB6_1: # %header
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8, %eax
+; X64-NEXT:    ja .LBB6_3
+; X64-NEXT:  # %bb.2: # %header
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    jmpq *.LJTI6_0(,%rax,8)
+; X64-NEXT:  .LBB6_4: # %bb1
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $1, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_6: # %bb3
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $3, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_7: # %bb4
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $4, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_11: # %bb8
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $8, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_5: # %bb2
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $2, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_9: # %bb6
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $6, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_10: # %bb7
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $7, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_12: # %bb9
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $9, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+; X64-NEXT:  .LBB6_3: # %bb0
+; X64-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64-NEXT:    movq $0, (%rsi)
+; X64-NEXT:    jmp .LBB6_1
+;
+; X64FAST-LABEL: switch_jumptable:
+; X64FAST:       # %bb.0: # %entry
+; X64FAST-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_1: # %header
+; X64FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movl (%rax), %eax
+; X64FAST-NEXT:    decl %eax
+; X64FAST-NEXT:    movl %eax, %ecx
+; X64FAST-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    subl $8, %eax
+; X64FAST-NEXT:    ja .LBB6_2
+; X64FAST-NEXT:  # %bb.12: # %header
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq .LJTI6_0(,%rax,8), %rax
+; X64FAST-NEXT:    jmpq *%rax
+; X64FAST-NEXT:  .LBB6_2: # %bb0
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $0, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_3: # %bb1
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $1, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_4: # %bb2
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $2, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_5: # %bb3
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $3, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_6: # %bb4
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $4, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_7: # %bb5
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $5, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_8: # %bb6
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $6, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_9: # %bb7
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $7, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_10: # %bb8
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $8, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+; X64FAST-NEXT:  .LBB6_11: # %bb9
+; X64FAST-NEXT:    # in Loop: Header=BB6_1 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $9, (%rax)
+; X64FAST-NEXT:    jmp .LBB6_1
+;
 ; X86-LABEL: switch_jumptable:
-; X86:         jmpl *
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_8: # %bb5
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $5, (%eax)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB9_1: # %header
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    decl %edx
+; X86-NEXT:    cmpl $8, %edx
+; X86-NEXT:    ja .LBB9_3
+; X86-NEXT:  # %bb.2: # %header
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    jmpl *.LJTI9_0(,%edx,4)
+; X86-NEXT:  .LBB9_4: # %bb1
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $1, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_6: # %bb3
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $3, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_7: # %bb4
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $4, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_11: # %bb8
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $8, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_5: # %bb2
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $2, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_9: # %bb6
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $6, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_10: # %bb7
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $7, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_12: # %bb9
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $9, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+; X86-NEXT:  .LBB9_3: # %bb0
+; X86-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    jmp .LBB9_1
+;
+; X86FAST-LABEL: switch_jumptable:
+; X86FAST:       # %bb.0: # %entry
+; X86FAST-NEXT:    subl $12, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 16
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_1: # %header
+; X86FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl (%eax), %eax
+; X86FAST-NEXT:    decl %eax
+; X86FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86FAST-NEXT:    subl $8, %eax
+; X86FAST-NEXT:    ja .LBB9_2
+; X86FAST-NEXT:  # %bb.12: # %header
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl .LJTI9_0(,%eax,4), %eax
+; X86FAST-NEXT:    jmpl *%eax
+; X86FAST-NEXT:  .LBB9_2: # %bb0
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $0, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_3: # %bb1
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $1, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_4: # %bb2
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $2, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_5: # %bb3
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $3, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_6: # %bb4
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $4, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_7: # %bb5
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $5, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_8: # %bb6
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $6, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_9: # %bb7
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $7, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_10: # %bb8
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $8, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
+; X86FAST-NEXT:  .LBB9_11: # %bb9
+; X86FAST-NEXT:    # in Loop: Header=BB9_1 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $9, (%eax)
+; X86FAST-NEXT:    jmp .LBB9_1
 entry:
   br label %header
 
@@ -262,9 +819,325 @@ bb9:
 ; Check that we preserve indirectbr when only calls are retpolined.
 define void @indirectbr_preserved(ptr readonly %p, ptr %sink) #0 {
 ; X64-LABEL: indirectbr_preserved:
-; X64:         jmpq *
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq indirectbr_preserved.targets@GOTPCREL(%rip), %rax
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp0: # Block address taken
+; X64-NEXT:  .LBB7_1: # %bb0
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $0, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp1: # Block address taken
+; X64-NEXT:  .LBB7_2: # %bb1
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $1, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp2: # Block address taken
+; X64-NEXT:  .LBB7_3: # %bb2
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $2, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp3: # Block address taken
+; X64-NEXT:  .LBB7_4: # %bb3
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $3, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp4: # Block address taken
+; X64-NEXT:  .LBB7_5: # %bb4
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $4, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp5: # Block address taken
+; X64-NEXT:  .LBB7_6: # %bb5
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $5, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp6: # Block address taken
+; X64-NEXT:  .LBB7_7: # %bb6
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $6, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp7: # Block address taken
+; X64-NEXT:  .LBB7_8: # %bb7
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $7, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp8: # Block address taken
+; X64-NEXT:  .LBB7_9: # %bb8
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $8, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .Ltmp9: # Block address taken
+; X64-NEXT:  .LBB7_10: # %bb9
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq $9, (%rsi)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    jmpq *(%rax,%rcx,8)
+;
+; X64FAST-LABEL: indirectbr_preserved:
+; X64FAST:       # %bb.0: # %entry
+; X64FAST-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq (%rdi), %rcx
+; X64FAST-NEXT:    movq indirectbr_preserved.targets@GOTPCREL(%rip), %rax
+; X64FAST-NEXT:    movq (%rax,%rcx,8), %rax
+; X64FAST-NEXT:    jmpq *%rax
+; X64FAST-NEXT:  .Ltmp0: # Block address taken
+; X64FAST-NEXT:  .LBB7_1: # %bb0
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $0, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp1: # Block address taken
+; X64FAST-NEXT:  .LBB7_2: # %bb1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $1, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp2: # Block address taken
+; X64FAST-NEXT:  .LBB7_3: # %bb2
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $2, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp3: # Block address taken
+; X64FAST-NEXT:  .LBB7_4: # %bb3
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $3, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp4: # Block address taken
+; X64FAST-NEXT:  .LBB7_5: # %bb4
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $4, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp5: # Block address taken
+; X64FAST-NEXT:  .LBB7_6: # %bb5
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $5, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp6: # Block address taken
+; X64FAST-NEXT:  .LBB7_7: # %bb6
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $6, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp7: # Block address taken
+; X64FAST-NEXT:  .LBB7_8: # %bb7
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $7, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp8: # Block address taken
+; X64FAST-NEXT:  .LBB7_9: # %bb8
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $8, (%rax)
+; X64FAST-NEXT:    jmp .LBB7_11
+; X64FAST-NEXT:  .Ltmp9: # Block address taken
+; X64FAST-NEXT:  .LBB7_10: # %bb9
+; X64FAST-NEXT:    # in Loop: Header=BB7_11 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $9, (%rax)
+; X64FAST-NEXT:  .LBB7_11: # %latch
+; X64FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq (%rax), %rcx
+; X64FAST-NEXT:    movq indirectbr_preserved.targets@GOTPCREL(%rip), %rax
+; X64FAST-NEXT:    movq (%rax,%rcx,8), %rax
+; X64FAST-NEXT:    jmpq *%rax
+;
 ; X86-LABEL: indirectbr_preserved:
-; X86:         jmpl *
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp0: # Block address taken
+; X86-NEXT:  .LBB10_1: # %bb0
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $0, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp1: # Block address taken
+; X86-NEXT:  .LBB10_2: # %bb1
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $1, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp2: # Block address taken
+; X86-NEXT:  .LBB10_3: # %bb2
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $2, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp3: # Block address taken
+; X86-NEXT:  .LBB10_4: # %bb3
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $3, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp4: # Block address taken
+; X86-NEXT:  .LBB10_5: # %bb4
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $4, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp5: # Block address taken
+; X86-NEXT:  .LBB10_6: # %bb5
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $5, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp6: # Block address taken
+; X86-NEXT:  .LBB10_7: # %bb6
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $6, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp7: # Block address taken
+; X86-NEXT:  .LBB10_8: # %bb7
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $7, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp8: # Block address taken
+; X86-NEXT:  .LBB10_9: # %bb8
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $8, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .Ltmp9: # Block address taken
+; X86-NEXT:  .LBB10_10: # %bb9
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $9, (%ecx)
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    jmpl *indirectbr_preserved.targets(,%edx,4)
+;
+; X86FAST-LABEL: indirectbr_preserved:
+; X86FAST:       # %bb.0: # %entry
+; X86FAST-NEXT:    subl $8, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 12
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl (%eax), %eax
+; X86FAST-NEXT:    movl indirectbr_preserved.targets(,%eax,4), %eax
+; X86FAST-NEXT:    jmpl *%eax
+; X86FAST-NEXT:  .Ltmp0: # Block address taken
+; X86FAST-NEXT:  .LBB10_1: # %bb0
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $0, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp1: # Block address taken
+; X86FAST-NEXT:  .LBB10_2: # %bb1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $1, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp2: # Block address taken
+; X86FAST-NEXT:  .LBB10_3: # %bb2
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $2, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp3: # Block address taken
+; X86FAST-NEXT:  .LBB10_4: # %bb3
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $3, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp4: # Block address taken
+; X86FAST-NEXT:  .LBB10_5: # %bb4
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $4, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp5: # Block address taken
+; X86FAST-NEXT:  .LBB10_6: # %bb5
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $5, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp6: # Block address taken
+; X86FAST-NEXT:  .LBB10_7: # %bb6
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $6, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp7: # Block address taken
+; X86FAST-NEXT:  .LBB10_8: # %bb7
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $7, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp8: # Block address taken
+; X86FAST-NEXT:  .LBB10_9: # %bb8
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $8, (%eax)
+; X86FAST-NEXT:    jmp .LBB10_11
+; X86FAST-NEXT:  .Ltmp9: # Block address taken
+; X86FAST-NEXT:  .LBB10_10: # %bb9
+; X86FAST-NEXT:    # in Loop: Header=BB10_11 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $9, (%eax)
+; X86FAST-NEXT:  .LBB10_11: # %latch
+; X86FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl (%eax), %eax
+; X86FAST-NEXT:    movl indirectbr_preserved.targets(,%eax,4), %eax
+; X86FAST-NEXT:    jmpl *%eax
 entry:
   %i0 = load i64, ptr %p
   %target.i0 = getelementptr [10 x ptr], ptr @indirectbr_preserved.targets, i64 0, i64 %i0
@@ -345,9 +1218,512 @@ latch:
 ; as a jump table.
 define void @indirectbr_rewrite(ptr readonly %p, ptr %sink) #1 {
 ; X64-LABEL: indirectbr_rewrite:
-; X64-NOT:     jmpq
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq indirectbr_rewrite.targets@GOTPCREL(%rip), %rax
+; X64-NEXT:    jmp .LBB8_8
+; X64-NEXT:  .LBB8_2: # %bb3
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $3, (%rsi)
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB8_7: # %latch
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:  .LBB8_8: # %switch_bb
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movq (%rax,%rcx,8), %rcx
+; X64-NEXT:    cmpq $5, %rcx
+; X64-NEXT:    jle .LBB8_9
+; X64-NEXT:  # %bb.16: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $7, %rcx
+; X64-NEXT:    jle .LBB8_17
+; X64-NEXT:  # %bb.20: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $8, %rcx
+; X64-NEXT:    je .LBB8_4
+; X64-NEXT:  # %bb.21: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $9, %rcx
+; X64-NEXT:    je .LBB8_5
+; X64-NEXT:  # %bb.22: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $10, %rcx
+; X64-NEXT:    jne .LBB8_23
+; X64-NEXT:  # %bb.6: # %bb9
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $9, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB8_9: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $3, %rcx
+; X64-NEXT:    jg .LBB8_13
+; X64-NEXT:  # %bb.10: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $2, %rcx
+; X64-NEXT:    je .LBB8_1
+; X64-NEXT:  # %bb.11: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $3, %rcx
+; X64-NEXT:    jne .LBB8_23
+; X64-NEXT:  # %bb.12: # %bb2
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $2, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB8_13: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $4, %rcx
+; X64-NEXT:    je .LBB8_2
+; X64-NEXT:  # %bb.14: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $5, %rcx
+; X64-NEXT:    jne .LBB8_23
+; X64-NEXT:  # %bb.15: # %bb4
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $4, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB8_17: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $6, %rcx
+; X64-NEXT:    je .LBB8_3
+; X64-NEXT:  # %bb.18: # %switch_bb
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    cmpq $7, %rcx
+; X64-NEXT:    jne .LBB8_23
+; X64-NEXT:  # %bb.19: # %bb6
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $6, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .LBB8_1: # %bb1
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $1, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .LBB8_4: # %bb7
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $7, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .LBB8_3: # %bb5
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $5, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .LBB8_5: # %bb8
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $8, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .LBB8_23: # %bb0
+; X64-NEXT:    # in Loop: Header=BB8_8 Depth=1
+; X64-NEXT:    movq $0, (%rsi)
+; X64-NEXT:    jmp .LBB8_7
+; X64-NEXT:  .Ltmp10: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp11: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp12: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp13: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp14: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp15: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp16: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp17: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp18: # Address of block that was removed by CodeGen
+; X64-NEXT:  .Ltmp19: # Address of block that was removed by CodeGen
+;
+; X64FAST-LABEL: indirectbr_rewrite:
+; X64FAST:       # %bb.0: # %entry
+; X64FAST-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    movq (%rdi), %rcx
+; X64FAST-NEXT:    movq indirectbr_rewrite.targets@GOTPCREL(%rip), %rax
+; X64FAST-NEXT:    movq (%rax,%rcx,8), %rax
+; X64FAST-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    jmp .LBB8_12
+; X64FAST-NEXT:  .LBB8_1: # %bb0
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $0, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_2: # %bb1
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $1, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_3: # %bb2
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $2, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_4: # %bb3
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $3, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_5: # %bb4
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $4, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_6: # %bb5
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $5, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_7: # %bb6
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $6, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_8: # %bb7
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $7, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_9: # %bb8
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $8, (%rax)
+; X64FAST-NEXT:    jmp .LBB8_11
+; X64FAST-NEXT:  .LBB8_10: # %bb9
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq $9, (%rax)
+; X64FAST-NEXT:  .LBB8_11: # %latch
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq (%rax), %rcx
+; X64FAST-NEXT:    movq indirectbr_rewrite.targets@GOTPCREL(%rip), %rax
+; X64FAST-NEXT:    movq (%rax,%rcx,8), %rax
+; X64FAST-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:  .LBB8_12: # %switch_bb
+; X64FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64FAST-NEXT:    subq $2, %rax
+; X64FAST-NEXT:    je .LBB8_2
+; X64FAST-NEXT:    jmp .LBB8_13
+; X64FAST-NEXT:  .LBB8_13: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $3, %rax
+; X64FAST-NEXT:    je .LBB8_3
+; X64FAST-NEXT:    jmp .LBB8_14
+; X64FAST-NEXT:  .LBB8_14: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $4, %rax
+; X64FAST-NEXT:    je .LBB8_4
+; X64FAST-NEXT:    jmp .LBB8_15
+; X64FAST-NEXT:  .LBB8_15: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $5, %rax
+; X64FAST-NEXT:    je .LBB8_5
+; X64FAST-NEXT:    jmp .LBB8_16
+; X64FAST-NEXT:  .LBB8_16: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $6, %rax
+; X64FAST-NEXT:    je .LBB8_6
+; X64FAST-NEXT:    jmp .LBB8_17
+; X64FAST-NEXT:  .LBB8_17: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $7, %rax
+; X64FAST-NEXT:    je .LBB8_7
+; X64FAST-NEXT:    jmp .LBB8_18
+; X64FAST-NEXT:  .LBB8_18: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $8, %rax
+; X64FAST-NEXT:    je .LBB8_8
+; X64FAST-NEXT:    jmp .LBB8_19
+; X64FAST-NEXT:  .LBB8_19: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $9, %rax
+; X64FAST-NEXT:    je .LBB8_9
+; X64FAST-NEXT:    jmp .LBB8_20
+; X64FAST-NEXT:  .LBB8_20: # %switch_bb
+; X64FAST-NEXT:    # in Loop: Header=BB8_12 Depth=1
+; X64FAST-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64FAST-NEXT:    subq $10, %rax
+; X64FAST-NEXT:    je .LBB8_10
+; X64FAST-NEXT:    jmp .LBB8_1
+; X64FAST-NEXT:  .Ltmp10: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp11: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp12: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp13: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp14: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp15: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp16: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp17: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp18: # Address of block that was removed by CodeGen
+; X64FAST-NEXT:  .Ltmp19: # Address of block that was removed by CodeGen
+;
 ; X86-LABEL: indirectbr_rewrite:
-; X86-NOT:     jmpl
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl indirectbr_rewrite.targets(,%ecx,4), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    jmp .LBB11_8
+; X86-NEXT:  .LBB11_2: # %bb3
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $3, (%ecx)
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB11_7: # %latch
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl indirectbr_rewrite.targets(,%edx,4), %edx
+; X86-NEXT:  .LBB11_8: # %switch_bb
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    cmpl $5, %edx
+; X86-NEXT:    jle .LBB11_9
+; X86-NEXT:  # %bb.16: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $7, %edx
+; X86-NEXT:    jle .LBB11_17
+; X86-NEXT:  # %bb.20: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $8, %edx
+; X86-NEXT:    je .LBB11_4
+; X86-NEXT:  # %bb.21: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $9, %edx
+; X86-NEXT:    je .LBB11_5
+; X86-NEXT:  # %bb.22: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $10, %edx
+; X86-NEXT:    jne .LBB11_23
+; X86-NEXT:  # %bb.6: # %bb9
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $9, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB11_9: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    jg .LBB11_13
+; X86-NEXT:  # %bb.10: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $2, %edx
+; X86-NEXT:    je .LBB11_1
+; X86-NEXT:  # %bb.11: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $3, %edx
+; X86-NEXT:    jne .LBB11_23
+; X86-NEXT:  # %bb.12: # %bb2
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $2, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB11_13: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $4, %edx
+; X86-NEXT:    je .LBB11_2
+; X86-NEXT:  # %bb.14: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $5, %edx
+; X86-NEXT:    jne .LBB11_23
+; X86-NEXT:  # %bb.15: # %bb4
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $4, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB11_17: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $6, %edx
+; X86-NEXT:    je .LBB11_3
+; X86-NEXT:  # %bb.18: # %switch_bb
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    cmpl $7, %edx
+; X86-NEXT:    jne .LBB11_23
+; X86-NEXT:  # %bb.19: # %bb6
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $6, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .LBB11_1: # %bb1
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $1, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .LBB11_4: # %bb7
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $7, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .LBB11_3: # %bb5
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $5, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .LBB11_5: # %bb8
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $8, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .LBB11_23: # %bb0
+; X86-NEXT:    # in Loop: Header=BB11_8 Depth=1
+; X86-NEXT:    movl $0, 4(%ecx)
+; X86-NEXT:    movl $0, (%ecx)
+; X86-NEXT:    jmp .LBB11_7
+; X86-NEXT:  .Ltmp10: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp11: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp12: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp13: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp14: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp15: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp16: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp17: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp18: # Address of block that was removed by CodeGen
+; X86-NEXT:  .Ltmp19: # Address of block that was removed by CodeGen
+;
+; X86FAST-LABEL: indirectbr_rewrite:
+; X86FAST:       # %bb.0: # %entry
+; X86FAST-NEXT:    subl $16, %esp
+; X86FAST-NEXT:    .cfi_def_cfa_offset 20
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    movl (%eax), %eax
+; X86FAST-NEXT:    movl indirectbr_rewrite.targets(,%eax,4), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:    jmp .LBB11_12
+; X86FAST-NEXT:  .LBB11_1: # %bb0
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $0, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_2: # %bb1
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $1, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_3: # %bb2
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $2, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_4: # %bb3
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $3, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_5: # %bb4
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $4, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_6: # %bb5
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $5, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_7: # %bb6
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $6, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_8: # %bb7
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $7, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_9: # %bb8
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $8, (%eax)
+; X86FAST-NEXT:    jmp .LBB11_11
+; X86FAST-NEXT:  .LBB11_10: # %bb9
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl $0, 4(%eax)
+; X86FAST-NEXT:    movl $9, (%eax)
+; X86FAST-NEXT:  .LBB11_11: # %latch
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl (%eax), %eax
+; X86FAST-NEXT:    movl indirectbr_rewrite.targets(,%eax,4), %eax
+; X86FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86FAST-NEXT:  .LBB11_12: # %switch_bb
+; X86FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86FAST-NEXT:    subl $2, %eax
+; X86FAST-NEXT:    je .LBB11_2
+; X86FAST-NEXT:    jmp .LBB11_13
+; X86FAST-NEXT:  .LBB11_13: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $3, %eax
+; X86FAST-NEXT:    je .LBB11_3
+; X86FAST-NEXT:    jmp .LBB11_14
+; X86FAST-NEXT:  .LBB11_14: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $4, %eax
+; X86FAST-NEXT:    je .LBB11_4
+; X86FAST-NEXT:    jmp .LBB11_15
+; X86FAST-NEXT:  .LBB11_15: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $5, %eax
+; X86FAST-NEXT:    je .LBB11_5
+; X86FAST-NEXT:    jmp .LBB11_16
+; X86FAST-NEXT:  .LBB11_16: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $6, %eax
+; X86FAST-NEXT:    je .LBB11_6
+; X86FAST-NEXT:    jmp .LBB11_17
+; X86FAST-NEXT:  .LBB11_17: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $7, %eax
+; X86FAST-NEXT:    je .LBB11_7
+; X86FAST-NEXT:    jmp .LBB11_18
+; X86FAST-NEXT:  .LBB11_18: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $8, %eax
+; X86FAST-NEXT:    je .LBB11_8
+; X86FAST-NEXT:    jmp .LBB11_19
+; X86FAST-NEXT:  .LBB11_19: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $9, %eax
+; X86FAST-NEXT:    je .LBB11_9
+; X86FAST-NEXT:    jmp .LBB11_20
+; X86FAST-NEXT:  .LBB11_20: # %switch_bb
+; X86FAST-NEXT:    # in Loop: Header=BB11_12 Depth=1
+; X86FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86FAST-NEXT:    subl $10, %eax
+; X86FAST-NEXT:    je .LBB11_10
+; X86FAST-NEXT:    jmp .LBB11_1
+; X86FAST-NEXT:  .Ltmp10: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp11: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp12: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp13: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp14: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp15: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp16: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp17: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp18: # Address of block that was removed by CodeGen
+; X86FAST-NEXT:  .Ltmp19: # Address of block that was removed by CodeGen
 entry:
   %i0 = load i64, ptr %p
   %target.i0 = getelementptr [10 x ptr], ptr @indirectbr_rewrite.targets, i64 0, i64 %i0
diff --git a/llvm/test/CodeGen/X86/rev16.ll b/llvm/test/CodeGen/X86/rev16.ll
index ecb1970185c2d..0f2b31d1ae4cd 100644
--- a/llvm/test/CodeGen/X86/rev16.ll
+++ b/llvm/test/CodeGen/X86/rev16.ll
@@ -29,12 +29,12 @@ define i32 @rev16(i32 %a) {
 define i32 @not_rev16(i32 %a) {
 ; X86-LABEL: not_rev16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    shrl $8, %ecx
-; X86-NEXT:    andl $65280, %ecx # imm = 0xFF00
-; X86-NEXT:    andl $16711680, %eax # imm = 0xFF0000
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    andl $16711680, %ecx # imm = 0xFF0000
+; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    andl $65280, %eax # imm = 0xFF00
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -61,8 +61,8 @@ define i32 @extra_maskop_uses2(i32 %a) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    shll $8, %ecx
-; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    andl $-16711936, %ecx # imm = 0xFF00FF00
+; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    andl $16711935, %eax # imm = 0xFF00FF
 ; X86-NEXT:    leal (%eax,%ecx), %edx
 ; X86-NEXT:    imull %ecx, %eax
@@ -119,10 +119,10 @@ define i32 @different_shift_amount(i32 %a) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll $9, %ecx
-; X86-NEXT:    shrl $8, %eax
-; X86-NEXT:    andl $-16712192, %ecx # imm = 0xFF00FE00
-; X86-NEXT:    andl $16711935, %eax # imm = 0xFF00FF
+; X86-NEXT:    shrl $8, %ecx
+; X86-NEXT:    andl $16711935, %ecx # imm = 0xFF00FF
+; X86-NEXT:    shll $9, %eax
+; X86-NEXT:    andl $-16712192, %eax # imm = 0xFF00FE00
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -170,10 +170,10 @@ define i32 @different_op(i32 %a) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll $8, %ecx
-; X86-NEXT:    shrl $8, %eax
-; X86-NEXT:    addl $16711936, %ecx # imm = 0xFF0100
-; X86-NEXT:    andl $16711935, %eax # imm = 0xFF00FF
+; X86-NEXT:    shrl $8, %ecx
+; X86-NEXT:    andl $16711935, %ecx # imm = 0xFF00FF
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    addl $16711936, %eax # imm = 0xFF0100
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -197,12 +197,12 @@ define i32 @different_op(i32 %a) {
 define i32 @different_vars(i32 %a, i32 %b) {
 ; X86-LABEL: different_vars:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $8, %ecx
-; X86-NEXT:    shrl $8, %eax
-; X86-NEXT:    andl $-16711936, %ecx # imm = 0xFF00FF00
-; X86-NEXT:    andl $16711935, %eax # imm = 0xFF00FF
+; X86-NEXT:    shrl $8, %ecx
+; X86-NEXT:    andl $16711935, %ecx # imm = 0xFF00FF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    andl $-16711936, %eax # imm = 0xFF00FF00
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/rot16.ll b/llvm/test/CodeGen/X86/rot16.ll
index c7c2d33d98922..8899d5c5263ce 100644
--- a/llvm/test/CodeGen/X86/rot16.ll
+++ b/llvm/test/CodeGen/X86/rot16.ll
@@ -30,10 +30,10 @@ define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind {
 define i16 @bar(i16 %x, i16 %y, i16 %z) nounwind {
 ; X86-LABEL: bar:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shldw %cl, %dx, %ax
 ; X86-NEXT:    retl
 ;
@@ -79,10 +79,10 @@ define i16 @un(i16 %x, i16 %y, i16 %z) nounwind {
 define i16 @bu(i16 %x, i16 %y, i16 %z) nounwind {
 ; X86-LABEL: bu:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdw %cl, %dx, %ax
 ; X86-NEXT:    retl
 ;
@@ -185,8 +185,8 @@ define i32 @rot16_demandedbits(i32 %x, i32 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shrl $11, %ecx
-; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    shrl $11, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    retl
@@ -211,8 +211,8 @@ define i16 @rot16_trunc(i32 %x, i32 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shrl $11, %ecx
-; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    shrl $11, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
@@ -292,18 +292,18 @@ define void @rotate16_memory(ptr %p, ptr %q) {
 ; X86-BASE-LABEL: rotate16_memory:
 ; X86-BASE:       # %bb.0:
 ; X86-BASE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BASE-NEXT:    movzwl (%eax), %eax
+; X86-BASE-NEXT:    rolw $8, %ax
 ; X86-BASE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BASE-NEXT:    movzwl (%ecx), %ecx
-; X86-BASE-NEXT:    rolw $8, %cx
-; X86-BASE-NEXT:    movw %cx, (%eax)
+; X86-BASE-NEXT:    movw %ax, (%ecx)
 ; X86-BASE-NEXT:    retl
 ;
 ; X86-MOVBE-LABEL: rotate16_memory:
 ; X86-MOVBE:       # %bb.0:
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-MOVBE-NEXT:    movzwl (%eax), %eax
 ; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movzwl (%ecx), %ecx
-; X86-MOVBE-NEXT:    movbew %cx, (%eax)
+; X86-MOVBE-NEXT:    movbew %ax, (%ecx)
 ; X86-MOVBE-NEXT:    retl
 ;
 ; X64-BASE-LABEL: rotate16_memory:
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index fdf43f5ab11f9..824cf5fab41ef 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -149,12 +149,19 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
 
 ; Result would undershift
 define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
-; CHECK-LABEL: no_extract_shl:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm1
-; CHECK-NEXT:    vpsrlq $39, %ymm0, %ymm0
-; CHECK-NEXT:    vpternlogq {{.*#+}} ymm0 = (ymm0 & m64bcst) | ymm1
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: no_extract_shl:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlq $39, %ymm0, %ymm1
+; X86-NEXT:    vpsllq $24, %ymm0, %ymm0
+; X86-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm1 & m64bcst)
+; X86-NEXT:    retl
+;
+; X64-LABEL: no_extract_shl:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsllq $24, %ymm0, %ymm1
+; X64-NEXT:    vpsrlq $39, %ymm0, %ymm0
+; X64-NEXT:    vpternlogq {{.*#+}} ymm0 = (ymm0 & m64bcst) | ymm1
+; X64-NEXT:    retq
   %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
   %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
   %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
@@ -164,12 +171,19 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
 
 ; Result would overshift
 define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
-; CHECK-LABEL: no_extract_shrl:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm1
-; CHECK-NEXT:    vpslld $25, %xmm0, %xmm0
-; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: no_extract_shrl:
+; X86:       # %bb.0:
+; X86-NEXT:    vpslld $25, %xmm0, %xmm1
+; X86-NEXT:    vpsrld $9, %xmm0, %xmm0
+; X86-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 | (xmm1 & m32bcst)
+; X86-NEXT:    retl
+;
+; X64-LABEL: no_extract_shrl:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrld $9, %xmm0, %xmm1
+; X64-NEXT:    vpslld $25, %xmm0, %xmm0
+; X64-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
+; X64-NEXT:    retq
   %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
   %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
   %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
@@ -181,11 +195,11 @@ define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
 define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
 ; X86-LABEL: no_extract_mul:
 ; X86:       # %bb.0:
-; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
-; X86-NEXT:    vpslld $3, %ymm0, %ymm2
-; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT:    vpsrld $23, %ymm0, %ymm0
-; X86-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; X86-NEXT:    vpslld $3, %ymm0, %ymm1
+; X86-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vpsrld $23, %ymm1, %ymm1
+; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: no_extract_mul:
@@ -207,6 +221,10 @@ define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
 define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-LABEL: no_extract_udiv:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
@@ -214,19 +232,16 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivdi3
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vextractps $2, %xmm0, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
-; X86-NEXT:    vmovd %eax, %xmm0
-; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
 ; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovss %xmm0, (%esp)
@@ -237,17 +252,26 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vextractps $2, %xmm0, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
+; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovd %eax, %xmm0
 ; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0
+; X86-NEXT:    vpinsrd $3, %ebp, %xmm0, %xmm0
+; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT:    vmovd %esi, %xmm0
+; X86-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
 ; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT:    vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vpsllq $56, %xmm0, %xmm0
+; X86-NEXT:    vpor {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; X86-NEXT:    addl $48, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: no_extract_udiv:
@@ -306,12 +330,19 @@ define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
 }
 
 define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
-; CHECK-LABEL: no_extract_add_1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
-; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: no_extract_add_1:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrld $27, %xmm0, %xmm1
+; X86-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: no_extract_add_1:
+; X64:       # %bb.0:
+; X64-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
+; X64-NEXT:    vpsrld $27, %xmm0, %xmm0
+; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
   %ii = add <4 x i32> %i, %i
   %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
   %out = or <4 x i32> %ii, %rhs
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index b5332068d7edd..f994e675add40 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -134,8 +134,8 @@ define i64 @no_extract_shl(i64 %i) nounwind {
 ; X86-LABEL: no_extract_shl:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shldl $10, %ecx, %edx
 ; X86-NEXT:    shll $10, %ecx
 ; X86-NEXT:    shrl $20, %eax
@@ -222,17 +222,18 @@ define i16 @no_extract_mul(i16 %i) nounwind {
 define i8 @no_extract_udiv(i8 %i) nounwind {
 ; X86-LABEL: no_extract_udiv:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $171, %eax, %ecx
-; X86-NEXT:    imull $79, %eax, %edx
-; X86-NEXT:    subb %dh, %al
-; X86-NEXT:    shrb %al
-; X86-NEXT:    addb %dh, %al
-; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    shlb $3, %ch
-; X86-NEXT:    orb %al, %ch
-; X86-NEXT:    andb $-9, %ch
-; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull $79, %ecx, %eax
+; X86-NEXT:    imull $171, %ecx, %edx
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    subb %ah, %cl
+; X86-NEXT:    shrb %cl
+; X86-NEXT:    addb %ah, %cl
+; X86-NEXT:    shrb $5, %cl
+; X86-NEXT:    shlb $3, %dh
+; X86-NEXT:    orb %cl, %dh
+; X86-NEXT:    andb $-9, %dh
+; X86-NEXT:    movb %dh, %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: no_extract_udiv:
@@ -299,8 +300,9 @@ define i32 @no_extract_add_1(i32 %i) nounwind {
 ; X86-LABEL: no_extract_add_1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax), %ecx
-; X86-NEXT:    shrl $27, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrl $27, %ecx
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll
index ea32edba62822..15dda6d62b294 100644
--- a/llvm/test/CodeGen/X86/rotate.ll
+++ b/llvm/test/CodeGen/X86/rotate.ll
@@ -8,33 +8,33 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movb $64, %cl
+; X86-NEXT:    subb %ch, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    movb $64, %ch
-; X86-NEXT:    subb %cl, %ch
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    shldl %cl, %ebx, %edx
 ; X86-NEXT:    testb $32, %ch
 ; X86-NEXT:    je .LBB0_4
 ; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB0_4:
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -569,11 +569,11 @@ define void @rotr1_64_mem(ptr %Aptr) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %ecx
 ; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shldl $31, %edx, %esi
-; X86-NEXT:    shldl $31, %ecx, %edx
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shldl $31, %ecx, %esi
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -653,29 +653,29 @@ define i64 @truncated_rot(i64 %x, i32 %amt) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb $64, %cl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    jne .LBB28_2
 ; X86-NEXT:  # %bb.1: # %entry
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:  .LBB28_2: # %entry
-; X86-NEXT:    movb $64, %dl
-; X86-NEXT:    subb %cl, %dl
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    testb $32, %dl
+; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    jne .LBB28_4
 ; X86-NEXT:  # %bb.3: # %entry
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:  .LBB28_4: # %entry
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll
index 0cc9f465dd75a..f08bc4dfe2b72 100644
--- a/llvm/test/CodeGen/X86/rotate4.ll
+++ b/llvm/test/CodeGen/X86/rotate4.ll
@@ -65,31 +65,33 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
 ; X86-NEXT:    .cfi_offset %esi, -16
 ; X86-NEXT:    .cfi_offset %edi, -12
 ; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shldl %cl, %esi, %edx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB2_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:  .LBB2_2:
-; X86-NEXT:    negb %cl
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    shrdl %cl, %edi, %esi
-; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    shldl %cl, %ebx, %edx
+; X86-NEXT:    testb $32, %ch
 ; X86-NEXT:    je .LBB2_4
 ; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB2_4:
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %edi
@@ -242,34 +244,37 @@ define void @rotate_left_m64(ptr%pa, i64 %b) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %esi
-; X86-NEXT:    movl 4(%eax), %ebx
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shldl %cl, %esi, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %ebx
+; X86-NEXT:    movl 4(%edx), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    je .LBB6_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:  .LBB6_2:
-; X86-NEXT:    negb %cl
 ; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll %cl, %ebp
+; X86-NEXT:    shldl %cl, %ebx, %esi
+; X86-NEXT:    testb $32, %al
 ; X86-NEXT:    je .LBB6_4
 ; X86-NEXT:  # %bb.3:
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    xorl %ebp, %ebp
 ; X86-NEXT:  .LBB6_4:
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -312,10 +317,10 @@ define void @rotate_right_m64(ptr%pa, i64 %b) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %ebx
 ; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    movl %ebx, %edi
@@ -336,8 +341,8 @@ define void @rotate_right_m64(ptr%pa, i64 %b) {
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    xorl %ebp, %ebp
 ; X86-NEXT:  .LBB7_4:
-; X86-NEXT:    orl %ebp, %edi
 ; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %ebp, %edi
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    popl %esi
@@ -581,9 +586,9 @@ define void @rotate_right_m16(ptr %p, i32 %amount) {
 define i32 @rotate_demanded_bits(i32, i32) {
 ; X86-LABEL: rotate_demanded_bits:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $30, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -607,9 +612,9 @@ define i32 @rotate_demanded_bits(i32, i32) {
 define i32 @rotate_demanded_bits_2(i32, i32) {
 ; X86-LABEL: rotate_demanded_bits_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $23, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    roll %cl, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll
index 5b9a42d1f0d91..a9a8f7b229def 100644
--- a/llvm/test/CodeGen/X86/sadd_sat.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat.ll
@@ -12,8 +12,8 @@ declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%ecx), %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    addl $-2147483648, %edx # imm = 0x80000000
@@ -70,8 +70,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addw %cx, %dx
 ; X86-NEXT:    movswl %dx, %edx
@@ -103,12 +103,12 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    addb %cl, %dl
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    addb %al, %dl
 ; X86-NEXT:    sarb $7, %dl
 ; X86-NEXT:    addb $-128, %dl
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    addb %al, %cl
+; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    movzbl %dl, %eax
 ; X86-NEXT:    cmovnol %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -134,14 +134,14 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    cmpb $7, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    cmovll %ecx, %eax
-; X86-NEXT:    cmpb $-7, %al
+; X86-NEXT:    cmpb $7, %cl
+; X86-NEXT:    cmovll %edx, %eax
 ; X86-NEXT:    movl $248, %ecx
+; X86-NEXT:    cmpb $-7, %al
 ; X86-NEXT:    cmovgel %eax, %ecx
 ; X86-NEXT:    movsbl %cl, %eax
 ; X86-NEXT:    retl
@@ -168,39 +168,39 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%eax), %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    addl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    cmovol %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%ecx,%eax), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    leal (%edx,%eax), %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    addl $-2147483648, %esi # imm = 0x80000000
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmovol %esi, %ecx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    cmovol %esi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%edx,%eax), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    leal (%esi,%eax), %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    addl $-2147483648, %edi # imm = 0x80000000
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovol %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%esi,%eax), %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    addl $-2147483648, %ebx # imm = 0x80000000
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovol %ebx, %esi
+; X86-NEXT:    cmovol %edi, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    leal (%edi,%eax), %ebx
 ; X86-NEXT:    sarl $31, %ebx
 ; X86-NEXT:    addl $-2147483648, %ebx # imm = 0x80000000
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmovol %ebx, %edi
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/sadd_sat_plus.ll b/llvm/test/CodeGen/X86/sadd_sat_plus.ll
index deabeb27cdab8..e6afb765cbbbe 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_plus.ll
@@ -107,9 +107,9 @@ define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounw
 define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind {
 ; X86-LABEL: func8:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mulb {{[0-9]+}}(%esp)
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addb %cl, %dl
 ; X86-NEXT:    sarb $7, %dl
@@ -150,14 +150,14 @@ define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
 ; X86-NEXT:    shlb $4, %al
 ; X86-NEXT:    sarb $4, %al
 ; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl $7, %ecx
 ; X86-NEXT:    cmpb $7, %al
-; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    cmovll %ecx, %eax
-; X86-NEXT:    cmpb $-7, %al
-; X86-NEXT:    movl $248, %ecx
-; X86-NEXT:    cmovgel %eax, %ecx
-; X86-NEXT:    movsbl %cl, %eax
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl $248, %eax
+; X86-NEXT:    cmpb $-7, %cl
+; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func4:
diff --git a/llvm/test/CodeGen/X86/sar_fold.ll b/llvm/test/CodeGen/X86/sar_fold.ll
index 0f1396954b03a..f94a31ca89dfe 100644
--- a/llvm/test/CodeGen/X86/sar_fold.ll
+++ b/llvm/test/CodeGen/X86/sar_fold.ll
@@ -50,11 +50,11 @@ define void @shl144sar48(ptr %p) #0 {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movswl (%eax), %ecx
+; CHECK-NEXT:    leal (,%ecx,4), %edx
+; CHECK-NEXT:    movl %edx, 12(%eax)
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:    sarl $31, %edx
 ; CHECK-NEXT:    shldl $2, %ecx, %edx
-; CHECK-NEXT:    shll $2, %ecx
-; CHECK-NEXT:    movl %ecx, 12(%eax)
 ; CHECK-NEXT:    movl %edx, 16(%eax)
 ; CHECK-NEXT:    movl $0, 8(%eax)
 ; CHECK-NEXT:    movl $0, 4(%eax)
diff --git a/llvm/test/CodeGen/X86/scalar-ext-logic.ll b/llvm/test/CodeGen/X86/scalar-ext-logic.ll
index 9926a5be6688c..e15b921627d9c 100644
--- a/llvm/test/CodeGen/X86/scalar-ext-logic.ll
+++ b/llvm/test/CodeGen/X86/scalar-ext-logic.ll
@@ -7,8 +7,8 @@ define i32 @sextinreg_i32(ptr %p0, ptr %p1) {
 ; X86-LABEL: sextinreg_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl (%ecx), %ecx
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    shll $27, %eax
@@ -40,13 +40,13 @@ define i32 @sextinreg_i32_mismatch(ptr %p0, ptr %p1) {
 ; X86-LABEL: sextinreg_i32_mismatch:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl (%ecx), %ecx
+; X86-NEXT:    movzbl (%eax), %ecx
+; X86-NEXT:    shll $27, %ecx
+; X86-NEXT:    sarl $27, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl (%eax), %eax
-; X86-NEXT:    shll $30, %ecx
-; X86-NEXT:    sarl $30, %ecx
-; X86-NEXT:    shll $27, %eax
-; X86-NEXT:    sarl $27, %eax
+; X86-NEXT:    shll $30, %eax
+; X86-NEXT:    sarl $30, %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/scalar-extract.ll b/llvm/test/CodeGen/X86/scalar-extract.ll
index f630abb024d69..94002b4329acd 100644
--- a/llvm/test/CodeGen/X86/scalar-extract.ll
+++ b/llvm/test/CodeGen/X86/scalar-extract.ll
@@ -8,9 +8,9 @@ define void @foo(ptr %A, ptr %B) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl (%ecx), %ecx
-; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    retl
 entry:
 	%tmp1 = load <2 x i16>, ptr %A		; <<2 x i16>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
index ce9723b3a84bc..bccb180fba939 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
@@ -803,27 +803,39 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ; X86-SSE-WIN:       # %bb.0:
 ; X86-SSE-WIN-NEXT:    pushl %ebp
 ; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    pushl %esi
 ; X86-SSE-WIN-NEXT:    andl $-16, %esp
 ; X86-SSE-WIN-NEXT:    subl $16, %esp
-; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
+; X86-SSE-WIN-NEXT:    movl 16(%ebp), %eax
+; X86-SSE-WIN-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE-WIN-NEXT:    movl 8(%ebp), %edx
+; X86-SSE-WIN-NEXT:    movl 12(%ebp), %esi
+; X86-SSE-WIN-NEXT:    pushl %ecx
+; X86-SSE-WIN-NEXT:    pushl %eax
+; X86-SSE-WIN-NEXT:    pushl %esi
+; X86-SSE-WIN-NEXT:    pushl %edx
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
-; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    leal -4(%ebp), %esp
+; X86-SSE-WIN-NEXT:    popl %esi
 ; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u32:
 ; X86-SSE-LIN:       # %bb.0:
-; X86-SSE-LIN-NEXT:    subl $12, %esp
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    subl $8, %esp
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-LIN-NEXT:    pushl %eax
+; X86-SSE-LIN-NEXT:    pushl %ecx
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    pushl %edx
 ; X86-SSE-LIN-NEXT:    calll __fixunstfsi
-; X86-SSE-LIN-NEXT:    addl $28, %esp
+; X86-SSE-LIN-NEXT:    addl $24, %esp
+; X86-SSE-LIN-NEXT:    popl %esi
 ; X86-SSE-LIN-NEXT:    retl
 ;
 ; X64-SSE-WIN-LABEL: t_to_u32:
@@ -847,27 +859,39 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ; X87-WIN:       # %bb.0:
 ; X87-WIN-NEXT:    pushl %ebp
 ; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    pushl %esi
 ; X87-WIN-NEXT:    andl $-16, %esp
 ; X87-WIN-NEXT:    subl $16, %esp
-; X87-WIN-NEXT:    pushl 20(%ebp)
-; X87-WIN-NEXT:    pushl 16(%ebp)
-; X87-WIN-NEXT:    pushl 12(%ebp)
-; X87-WIN-NEXT:    pushl 8(%ebp)
+; X87-WIN-NEXT:    movl 16(%ebp), %eax
+; X87-WIN-NEXT:    movl 20(%ebp), %ecx
+; X87-WIN-NEXT:    movl 8(%ebp), %edx
+; X87-WIN-NEXT:    movl 12(%ebp), %esi
+; X87-WIN-NEXT:    pushl %ecx
+; X87-WIN-NEXT:    pushl %eax
+; X87-WIN-NEXT:    pushl %esi
+; X87-WIN-NEXT:    pushl %edx
 ; X87-WIN-NEXT:    calll ___fixunstfsi
 ; X87-WIN-NEXT:    addl $16, %esp
-; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    leal -4(%ebp), %esp
+; X87-WIN-NEXT:    popl %esi
 ; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u32:
 ; X87-LIN:       # %bb.0:
-; X87-LIN-NEXT:    subl $12, %esp
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    subl $8, %esp
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X87-LIN-NEXT:    pushl %eax
+; X87-LIN-NEXT:    pushl %ecx
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    pushl %edx
 ; X87-LIN-NEXT:    calll __fixunstfsi
-; X87-LIN-NEXT:    addl $28, %esp
+; X87-LIN-NEXT:    addl $24, %esp
+; X87-LIN-NEXT:    popl %esi
 ; X87-LIN-NEXT:    retl
   %r = fptoui fp128 %a to i32
   ret i32 %r
@@ -917,27 +941,39 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ; X86-SSE-WIN:       # %bb.0:
 ; X86-SSE-WIN-NEXT:    pushl %ebp
 ; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    pushl %esi
 ; X86-SSE-WIN-NEXT:    andl $-16, %esp
 ; X86-SSE-WIN-NEXT:    subl $16, %esp
-; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
+; X86-SSE-WIN-NEXT:    movl 16(%ebp), %eax
+; X86-SSE-WIN-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE-WIN-NEXT:    movl 8(%ebp), %edx
+; X86-SSE-WIN-NEXT:    movl 12(%ebp), %esi
+; X86-SSE-WIN-NEXT:    pushl %ecx
+; X86-SSE-WIN-NEXT:    pushl %eax
+; X86-SSE-WIN-NEXT:    pushl %esi
+; X86-SSE-WIN-NEXT:    pushl %edx
 ; X86-SSE-WIN-NEXT:    calll ___fixtfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
-; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    leal -4(%ebp), %esp
+; X86-SSE-WIN-NEXT:    popl %esi
 ; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s32:
 ; X86-SSE-LIN:       # %bb.0:
-; X86-SSE-LIN-NEXT:    subl $12, %esp
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    subl $8, %esp
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-LIN-NEXT:    pushl %eax
+; X86-SSE-LIN-NEXT:    pushl %ecx
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    pushl %edx
 ; X86-SSE-LIN-NEXT:    calll __fixtfsi
-; X86-SSE-LIN-NEXT:    addl $28, %esp
+; X86-SSE-LIN-NEXT:    addl $24, %esp
+; X86-SSE-LIN-NEXT:    popl %esi
 ; X86-SSE-LIN-NEXT:    retl
 ;
 ; X64-SSE-WIN-LABEL: t_to_s32:
@@ -961,27 +997,39 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ; X87-WIN:       # %bb.0:
 ; X87-WIN-NEXT:    pushl %ebp
 ; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    pushl %esi
 ; X87-WIN-NEXT:    andl $-16, %esp
 ; X87-WIN-NEXT:    subl $16, %esp
-; X87-WIN-NEXT:    pushl 20(%ebp)
-; X87-WIN-NEXT:    pushl 16(%ebp)
-; X87-WIN-NEXT:    pushl 12(%ebp)
-; X87-WIN-NEXT:    pushl 8(%ebp)
+; X87-WIN-NEXT:    movl 16(%ebp), %eax
+; X87-WIN-NEXT:    movl 20(%ebp), %ecx
+; X87-WIN-NEXT:    movl 8(%ebp), %edx
+; X87-WIN-NEXT:    movl 12(%ebp), %esi
+; X87-WIN-NEXT:    pushl %ecx
+; X87-WIN-NEXT:    pushl %eax
+; X87-WIN-NEXT:    pushl %esi
+; X87-WIN-NEXT:    pushl %edx
 ; X87-WIN-NEXT:    calll ___fixtfsi
 ; X87-WIN-NEXT:    addl $16, %esp
-; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    leal -4(%ebp), %esp
+; X87-WIN-NEXT:    popl %esi
 ; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s32:
 ; X87-LIN:       # %bb.0:
-; X87-LIN-NEXT:    subl $12, %esp
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    subl $8, %esp
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X87-LIN-NEXT:    pushl %eax
+; X87-LIN-NEXT:    pushl %ecx
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    pushl %edx
 ; X87-LIN-NEXT:    calll __fixtfsi
-; X87-LIN-NEXT:    addl $28, %esp
+; X87-LIN-NEXT:    addl $24, %esp
+; X87-LIN-NEXT:    popl %esi
 ; X87-LIN-NEXT:    retl
   %r = fptosi fp128 %a to i32
   ret i32 %r
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index 3287869f2c601..40dacd029a616 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -911,14 +911,15 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
 ; X86-AVX512-WIN-NEXT:    andl $-16, %esp
 ; X86-AVX512-WIN-NEXT:    subl $16, %esp
+; X86-AVX512-WIN-NEXT:    fldz
 ; X86-AVX512-WIN-NEXT:    fldt 8(%ebp)
 ; X86-AVX512-WIN-NEXT:    flds __real@5f000000
 ; X86-AVX512-WIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512-WIN-NEXT:    fucomi %st(1), %st
-; X86-AVX512-WIN-NEXT:    fldz
-; X86-AVX512-WIN-NEXT:    fcmovbe %st(1), %st
-; X86-AVX512-WIN-NEXT:    fstp %st(1)
-; X86-AVX512-WIN-NEXT:    fsubrp %st, %st(1)
+; X86-AVX512-WIN-NEXT:    fxch %st(2)
+; X86-AVX512-WIN-NEXT:    fcmovbe %st(2), %st
+; X86-AVX512-WIN-NEXT:    fstp %st(2)
+; X86-AVX512-WIN-NEXT:    fsubp %st, %st(1)
 ; X86-AVX512-WIN-NEXT:    fisttpll (%esp)
 ; X86-AVX512-WIN-NEXT:    setbe %dl
 ; X86-AVX512-WIN-NEXT:    shll $31, %edx
@@ -931,14 +932,15 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-AVX512-LIN-LABEL: x_to_u64:
 ; X86-AVX512-LIN:       # %bb.0:
 ; X86-AVX512-LIN-NEXT:    subl $12, %esp
+; X86-AVX512-LIN-NEXT:    fldz
 ; X86-AVX512-LIN-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-AVX512-LIN-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-AVX512-LIN-NEXT:    xorl %edx, %edx
 ; X86-AVX512-LIN-NEXT:    fucomi %st(1), %st
-; X86-AVX512-LIN-NEXT:    fldz
-; X86-AVX512-LIN-NEXT:    fcmovbe %st(1), %st
-; X86-AVX512-LIN-NEXT:    fstp %st(1)
-; X86-AVX512-LIN-NEXT:    fsubrp %st, %st(1)
+; X86-AVX512-LIN-NEXT:    fxch %st(2)
+; X86-AVX512-LIN-NEXT:    fcmovbe %st(2), %st
+; X86-AVX512-LIN-NEXT:    fstp %st(2)
+; X86-AVX512-LIN-NEXT:    fsubp %st, %st(1)
 ; X86-AVX512-LIN-NEXT:    fisttpll (%esp)
 ; X86-AVX512-LIN-NEXT:    setbe %dl
 ; X86-AVX512-LIN-NEXT:    shll $31, %edx
@@ -987,14 +989,15 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE3-WIN-NEXT:    movl %esp, %ebp
 ; X86-SSE3-WIN-NEXT:    andl $-16, %esp
 ; X86-SSE3-WIN-NEXT:    subl $16, %esp
+; X86-SSE3-WIN-NEXT:    fldz
 ; X86-SSE3-WIN-NEXT:    fldt 8(%ebp)
 ; X86-SSE3-WIN-NEXT:    flds __real@5f000000
 ; X86-SSE3-WIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-WIN-NEXT:    fucomi %st(1), %st
-; X86-SSE3-WIN-NEXT:    fldz
-; X86-SSE3-WIN-NEXT:    fcmovbe %st(1), %st
-; X86-SSE3-WIN-NEXT:    fstp %st(1)
-; X86-SSE3-WIN-NEXT:    fsubrp %st, %st(1)
+; X86-SSE3-WIN-NEXT:    fxch %st(2)
+; X86-SSE3-WIN-NEXT:    fcmovbe %st(2), %st
+; X86-SSE3-WIN-NEXT:    fstp %st(2)
+; X86-SSE3-WIN-NEXT:    fsubp %st, %st(1)
 ; X86-SSE3-WIN-NEXT:    fisttpll (%esp)
 ; X86-SSE3-WIN-NEXT:    setbe %dl
 ; X86-SSE3-WIN-NEXT:    shll $31, %edx
@@ -1007,14 +1010,15 @@ define i64 @x_to_u64(x86_fp80 %a) nounwind {
 ; X86-SSE3-LIN-LABEL: x_to_u64:
 ; X86-SSE3-LIN:       # %bb.0:
 ; X86-SSE3-LIN-NEXT:    subl $12, %esp
+; X86-SSE3-LIN-NEXT:    fldz
 ; X86-SSE3-LIN-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-SSE3-LIN-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-SSE3-LIN-NEXT:    xorl %edx, %edx
 ; X86-SSE3-LIN-NEXT:    fucomi %st(1), %st
-; X86-SSE3-LIN-NEXT:    fldz
-; X86-SSE3-LIN-NEXT:    fcmovbe %st(1), %st
-; X86-SSE3-LIN-NEXT:    fstp %st(1)
-; X86-SSE3-LIN-NEXT:    fsubrp %st, %st(1)
+; X86-SSE3-LIN-NEXT:    fxch %st(2)
+; X86-SSE3-LIN-NEXT:    fcmovbe %st(2), %st
+; X86-SSE3-LIN-NEXT:    fstp %st(2)
+; X86-SSE3-LIN-NEXT:    fsubp %st, %st(1)
 ; X86-SSE3-LIN-NEXT:    fisttpll (%esp)
 ; X86-SSE3-LIN-NEXT:    setbe %dl
 ; X86-SSE3-LIN-NEXT:    shll $31, %edx
@@ -1458,27 +1462,39 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ; X86-SSE-WIN:       # %bb.0:
 ; X86-SSE-WIN-NEXT:    pushl %ebp
 ; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    pushl %esi
 ; X86-SSE-WIN-NEXT:    andl $-16, %esp
 ; X86-SSE-WIN-NEXT:    subl $16, %esp
-; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
+; X86-SSE-WIN-NEXT:    movl 16(%ebp), %eax
+; X86-SSE-WIN-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE-WIN-NEXT:    movl 8(%ebp), %edx
+; X86-SSE-WIN-NEXT:    movl 12(%ebp), %esi
+; X86-SSE-WIN-NEXT:    pushl %ecx
+; X86-SSE-WIN-NEXT:    pushl %eax
+; X86-SSE-WIN-NEXT:    pushl %esi
+; X86-SSE-WIN-NEXT:    pushl %edx
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
-; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    leal -4(%ebp), %esp
+; X86-SSE-WIN-NEXT:    popl %esi
 ; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u64:
 ; X86-SSE-LIN:       # %bb.0:
-; X86-SSE-LIN-NEXT:    subl $12, %esp
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    subl $8, %esp
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-LIN-NEXT:    pushl %eax
+; X86-SSE-LIN-NEXT:    pushl %ecx
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    pushl %edx
 ; X86-SSE-LIN-NEXT:    calll __fixunstfdi
-; X86-SSE-LIN-NEXT:    addl $28, %esp
+; X86-SSE-LIN-NEXT:    addl $24, %esp
+; X86-SSE-LIN-NEXT:    popl %esi
 ; X86-SSE-LIN-NEXT:    retl
 ;
 ; X64-SSE-WIN-LABEL: t_to_u64:
@@ -1502,27 +1518,39 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ; X87-WIN:       # %bb.0:
 ; X87-WIN-NEXT:    pushl %ebp
 ; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    pushl %esi
 ; X87-WIN-NEXT:    andl $-16, %esp
 ; X87-WIN-NEXT:    subl $16, %esp
-; X87-WIN-NEXT:    pushl 20(%ebp)
-; X87-WIN-NEXT:    pushl 16(%ebp)
-; X87-WIN-NEXT:    pushl 12(%ebp)
-; X87-WIN-NEXT:    pushl 8(%ebp)
+; X87-WIN-NEXT:    movl 16(%ebp), %eax
+; X87-WIN-NEXT:    movl 20(%ebp), %ecx
+; X87-WIN-NEXT:    movl 8(%ebp), %edx
+; X87-WIN-NEXT:    movl 12(%ebp), %esi
+; X87-WIN-NEXT:    pushl %ecx
+; X87-WIN-NEXT:    pushl %eax
+; X87-WIN-NEXT:    pushl %esi
+; X87-WIN-NEXT:    pushl %edx
 ; X87-WIN-NEXT:    calll ___fixunstfdi
 ; X87-WIN-NEXT:    addl $16, %esp
-; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    leal -4(%ebp), %esp
+; X87-WIN-NEXT:    popl %esi
 ; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u64:
 ; X87-LIN:       # %bb.0:
-; X87-LIN-NEXT:    subl $12, %esp
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    subl $8, %esp
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X87-LIN-NEXT:    pushl %eax
+; X87-LIN-NEXT:    pushl %ecx
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    pushl %edx
 ; X87-LIN-NEXT:    calll __fixunstfdi
-; X87-LIN-NEXT:    addl $28, %esp
+; X87-LIN-NEXT:    addl $24, %esp
+; X87-LIN-NEXT:    popl %esi
 ; X87-LIN-NEXT:    retl
   %r = fptoui fp128 %a to i64
   ret i64 %r
@@ -1572,27 +1600,39 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ; X86-SSE-WIN:       # %bb.0:
 ; X86-SSE-WIN-NEXT:    pushl %ebp
 ; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    pushl %esi
 ; X86-SSE-WIN-NEXT:    andl $-16, %esp
 ; X86-SSE-WIN-NEXT:    subl $16, %esp
-; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
-; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
+; X86-SSE-WIN-NEXT:    movl 16(%ebp), %eax
+; X86-SSE-WIN-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE-WIN-NEXT:    movl 8(%ebp), %edx
+; X86-SSE-WIN-NEXT:    movl 12(%ebp), %esi
+; X86-SSE-WIN-NEXT:    pushl %ecx
+; X86-SSE-WIN-NEXT:    pushl %eax
+; X86-SSE-WIN-NEXT:    pushl %esi
+; X86-SSE-WIN-NEXT:    pushl %edx
 ; X86-SSE-WIN-NEXT:    calll ___fixtfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
-; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    leal -4(%ebp), %esp
+; X86-SSE-WIN-NEXT:    popl %esi
 ; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s64:
 ; X86-SSE-LIN:       # %bb.0:
-; X86-SSE-LIN-NEXT:    subl $12, %esp
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    subl $8, %esp
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-LIN-NEXT:    pushl %eax
+; X86-SSE-LIN-NEXT:    pushl %ecx
+; X86-SSE-LIN-NEXT:    pushl %esi
+; X86-SSE-LIN-NEXT:    pushl %edx
 ; X86-SSE-LIN-NEXT:    calll __fixtfdi
-; X86-SSE-LIN-NEXT:    addl $28, %esp
+; X86-SSE-LIN-NEXT:    addl $24, %esp
+; X86-SSE-LIN-NEXT:    popl %esi
 ; X86-SSE-LIN-NEXT:    retl
 ;
 ; X64-SSE-WIN-LABEL: t_to_s64:
@@ -1616,27 +1656,39 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ; X87-WIN:       # %bb.0:
 ; X87-WIN-NEXT:    pushl %ebp
 ; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    pushl %esi
 ; X87-WIN-NEXT:    andl $-16, %esp
 ; X87-WIN-NEXT:    subl $16, %esp
-; X87-WIN-NEXT:    pushl 20(%ebp)
-; X87-WIN-NEXT:    pushl 16(%ebp)
-; X87-WIN-NEXT:    pushl 12(%ebp)
-; X87-WIN-NEXT:    pushl 8(%ebp)
+; X87-WIN-NEXT:    movl 16(%ebp), %eax
+; X87-WIN-NEXT:    movl 20(%ebp), %ecx
+; X87-WIN-NEXT:    movl 8(%ebp), %edx
+; X87-WIN-NEXT:    movl 12(%ebp), %esi
+; X87-WIN-NEXT:    pushl %ecx
+; X87-WIN-NEXT:    pushl %eax
+; X87-WIN-NEXT:    pushl %esi
+; X87-WIN-NEXT:    pushl %edx
 ; X87-WIN-NEXT:    calll ___fixtfdi
 ; X87-WIN-NEXT:    addl $16, %esp
-; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    leal -4(%ebp), %esp
+; X87-WIN-NEXT:    popl %esi
 ; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s64:
 ; X87-LIN:       # %bb.0:
-; X87-LIN-NEXT:    subl $12, %esp
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-LIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    subl $8, %esp
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X87-LIN-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X87-LIN-NEXT:    pushl %eax
+; X87-LIN-NEXT:    pushl %ecx
+; X87-LIN-NEXT:    pushl %esi
+; X87-LIN-NEXT:    pushl %edx
 ; X87-LIN-NEXT:    calll __fixtfdi
-; X87-LIN-NEXT:    addl $28, %esp
+; X87-LIN-NEXT:    addl $24, %esp
+; X87-LIN-NEXT:    popl %esi
 ; X87-LIN-NEXT:    retl
   %r = fptosi fp128 %a to i64
   ret i64 %r
diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
index 43c1a84f7cd6c..caaec2233220f 100644
--- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -328,9 +328,9 @@ define float @u64_to_f(i64 %a) nounwind {
 ; AVX512F_32-NEXT:    movl %esp, %ebp
 ; AVX512F_32-NEXT:    andl $-8, %esp
 ; AVX512F_32-NEXT:    subl $16, %esp
-; AVX512F_32-NEXT:    movl 12(%ebp), %eax
 ; AVX512F_32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F_32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F_32-NEXT:    movl 12(%ebp), %eax
 ; AVX512F_32-NEXT:    shrl $31, %eax
 ; AVX512F_32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; AVX512F_32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -348,9 +348,9 @@ define float @u64_to_f(i64 %a) nounwind {
 ; SSE2_32-NEXT:    movl %esp, %ebp
 ; SSE2_32-NEXT:    andl $-8, %esp
 ; SSE2_32-NEXT:    subl $16, %esp
-; SSE2_32-NEXT:    movl 12(%ebp), %eax
 ; SSE2_32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2_32-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    movl 12(%ebp), %eax
 ; SSE2_32-NEXT:    shrl $31, %eax
 ; SSE2_32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE2_32-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -577,8 +577,8 @@ define float @s64_to_f_2(i64 %a) nounwind {
 ; SSE1_32-NEXT:    movl 8(%ebp), %eax
 ; SSE1_32-NEXT:    movl 12(%ebp), %ecx
 ; SSE1_32-NEXT:    addl $5, %eax
-; SSE1_32-NEXT:    adcl $0, %ecx
 ; SSE1_32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; SSE1_32-NEXT:    adcl $0, %ecx
 ; SSE1_32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; SSE1_32-NEXT:    fildll {{[0-9]+}}(%esp)
 ; SSE1_32-NEXT:    fstps {{[0-9]+}}(%esp)
@@ -596,8 +596,8 @@ define float @s64_to_f_2(i64 %a) nounwind {
 ; X87-NEXT:    movl 8(%ebp), %eax
 ; X87-NEXT:    movl 12(%ebp), %ecx
 ; X87-NEXT:    addl $5, %eax
-; X87-NEXT:    adcl $0, %ecx
 ; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    adcl $0, %ecx
 ; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
 ; X87-NEXT:    movl %ebp, %esp
@@ -1034,8 +1034,8 @@ define double @s64_to_d_2(i64 %a) nounwind {
 ; SSE1_32-NEXT:    movl 8(%ebp), %eax
 ; SSE1_32-NEXT:    movl 12(%ebp), %ecx
 ; SSE1_32-NEXT:    addl $5, %eax
-; SSE1_32-NEXT:    adcl $0, %ecx
 ; SSE1_32-NEXT:    movl %eax, (%esp)
+; SSE1_32-NEXT:    adcl $0, %ecx
 ; SSE1_32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; SSE1_32-NEXT:    fildll (%esp)
 ; SSE1_32-NEXT:    movl %ebp, %esp
@@ -1051,8 +1051,8 @@ define double @s64_to_d_2(i64 %a) nounwind {
 ; X87-NEXT:    movl 8(%ebp), %eax
 ; X87-NEXT:    movl 12(%ebp), %ecx
 ; X87-NEXT:    addl $5, %eax
-; X87-NEXT:    adcl $0, %ecx
 ; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    adcl $0, %ecx
 ; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X87-NEXT:    fildll (%esp)
 ; X87-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 393e05bfd0cc6..010f2f9a74a16 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -515,12 +515,12 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: scmp_wide_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %cl, %dl
-; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
@@ -736,41 +736,35 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: scmp_normal_vectors:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    setg %dh
-; X86-NEXT:    subb %dl, %dh
-; X86-NEXT:    movsbl %dh, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movsbl %bh, %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movsbl %bh, %esi
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    setg %ch
-; X86-NEXT:    subb %cl, %ch
-; X86-NEXT:    movsbl %ch, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.scmp(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %1
@@ -893,37 +887,31 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: scmp_narrow_vec_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 3(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setl %ch
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %ch, %cl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    setl %ch
-; X86-NEXT:    setg %bl
-; X86-NEXT:    subb %ch, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setl %ch
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %ch, %bh
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    setg %ch
-; X86-NEXT:    subb %dl, %ch
-; X86-NEXT:    movb %ch, 3(%eax)
-; X86-NEXT:    movb %bh, 2(%eax)
-; X86-NEXT:    movb %bl, 1(%eax)
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movb %dl, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movb %dl, 1(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movb %dl, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; SETZUCC-LABEL: scmp_narrow_vec_result:
@@ -1068,41 +1056,35 @@ define <4 x i32> @scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ;
 ; X86-LABEL: scmp_narrow_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    setl %al
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    setl %dl
-; X86-NEXT:    setg %dh
-; X86-NEXT:    subb %dl, %dh
-; X86-NEXT:    movsbl %dh, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    setl %bl
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movsbl %bh, %esi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    setl %ch
-; X86-NEXT:    setg %bl
-; X86-NEXT:    subb %ch, %bl
-; X86-NEXT:    movsbl %bl, %edi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    setg %ch
-; X86-NEXT:    subb %cl, %ch
-; X86-NEXT:    movsbl %ch, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; SETZUCC-LABEL: scmp_narrow_vec_op:
@@ -1245,140 +1227,119 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ;
 ; X86-LABEL: scmp_wide_vec_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %cl
 ; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %al, %bh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %bl
-; X86-NEXT:    subb %al, %bl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dh
-; X86-NEXT:    subb %al, %dh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 60(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %ebp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 52(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %ah
-; X86-NEXT:    subb %al, %ah
-; X86-NEXT:    movsbl %ah, %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    setl %al
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movsbl %dl, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %esi, 56(%eax)
-; X86-NEXT:    movl %edi, 52(%eax)
-; X86-NEXT:    movl %ebp, 48(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %dh, %edx
-; X86-NEXT:    movl %edx, 36(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bl, %esi
-; X86-NEXT:    movl %esi, 32(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bh, %edi
-; X86-NEXT:    movl %edi, 28(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movl %ebx, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
-; X86-NEXT:    movl %esi, 16(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; SETZUCC-LABEL: scmp_wide_vec_result:
@@ -1981,249 +1942,221 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ;
 ; X86-LABEL: scmp_wide_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 15(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    setl %ah
-; X86-NEXT:    subb %al, %ah
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %ecx, %ebp
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %ah
-; X86-NEXT:    subb %al, %ah
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %edi, %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setl %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %ebp, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    subb %bl, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    setl %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 11(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setl %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 10(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    setl %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 9(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    setl %dl
-; X86-NEXT:    subb %bl, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    subb %bl, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    setl %al
-; X86-NEXT:    subb %bl, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 7(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ecx, %ebp
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 6(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ecx, %ebp
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 5(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    setl %dl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    setl %ch
-; X86-NEXT:    subb %dl, %ch
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    setl %dl
-; X86-NEXT:    subb %cl, %dl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 3(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    setl %dh
-; X86-NEXT:    cmpl %esi, %ebx
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    subb %dh, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subb %bl, %cl
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    setl %dh
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %bl
-; X86-NEXT:    subb %dh, %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %bl, 15(%eax)
-; X86-NEXT:    movb %cl, 14(%eax)
-; X86-NEXT:    movb %dl, 13(%eax)
-; X86-NEXT:    movb %ch, 12(%eax)
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    subb %bl, %cl
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; SETZUCC-LABEL: scmp_wide_vec_op:
@@ -2979,200 +2912,182 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    subl $44, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    sarb %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    addb %dh, %dh
-; X86-NEXT:    sarb %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    addb %dl, %dl
-; X86-NEXT:    sarb %dl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    sarb %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    addb %ah, %ah
-; X86-NEXT:    sarb %ah
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addb %cl, %cl
 ; X86-NEXT:    sarb %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    addb %ch, %ch
-; X86-NEXT:    sarb %ch
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addb %bl, %bl
-; X86-NEXT:    sarb %bl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    addb %bh, %bh
-; X86-NEXT:    sarb %bh
-; X86-NEXT:    cmpb %bl, %bh
-; X86-NEXT:    setl %bl
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %bl, %bh
-; X86-NEXT:    movsbl %bh, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $2097151, %esi # imm = 0x1FFFFF
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpb %cl, %ch
+; X86-NEXT:    cmpb %al, %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    setg %ch
-; X86-NEXT:    subb %cl, %ch
-; X86-NEXT:    movsbl %ch, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl $2097151, %ecx # imm = 0x1FFFFF
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpb %al, %ah
-; X86-NEXT:    setl %al
-; X86-NEXT:    setg %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $2097151, %eax # imm = 0x1FFFFF
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpb %dh, %dl
-; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movsbl %dl, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sarl $31, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    setl %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    subb %cl, %dl
 ; X86-NEXT:    movsbl %dl, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    setl %al
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %dl
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    sarb %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb %dl, %dl
+; X86-NEXT:    sarb %dl
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    setl %dl
-; X86-NEXT:    setg %dh
-; X86-NEXT:    subb %dl, %dh
-; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    subb %cl, %dl
+; X86-NEXT:    movsbl %al, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %ebp, 96(%eax)
+; X86-NEXT:    movl %ebp, 92(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 80(%eax)
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, 68(%eax)
+; X86-NEXT:    movl %edi, 64(%eax)
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, 52(%eax)
+; X86-NEXT:    movl %ebx, 48(%eax)
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movsbl %dl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, 96(%esi)
-; X86-NEXT:    movl %edx, 92(%esi)
-; X86-NEXT:    movl %ebx, 80(%esi)
-; X86-NEXT:    movl %eax, 68(%esi)
-; X86-NEXT:    movl %eax, 64(%esi)
-; X86-NEXT:    movl %edi, 52(%esi)
-; X86-NEXT:    movl %edi, 48(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, 36(%esi)
-; X86-NEXT:    movl %ebp, 24(%esi)
-; X86-NEXT:    movl %ebp, 20(%esi)
-; X86-NEXT:    movl %ecx, 8(%esi)
-; X86-NEXT:    movl %ecx, 4(%esi)
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movw %dx, 100(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $2, %edx, %ecx
-; X86-NEXT:    movl %ecx, 88(%esi)
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl %edx, 20(%eax)
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movw %bp, 100(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $9, %esi, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %ecx, 76(%edx)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $20, %edx, %ecx
-; X86-NEXT:    movl %ecx, 60(%esi)
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $31, %esi, %ecx
-; X86-NEXT:    movl %ecx, 44(%ebx)
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    shrdl $2, %ebp, %ecx
+; X86-NEXT:    movl %ecx, 88(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    andl $2097151, %ecx # imm = 0x1FFFFF
+; X86-NEXT:    shldl $9, %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $9, %edi, %ebp
+; X86-NEXT:    movl %ebp, 76(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shrdl $22, %ebx, %ecx
-; X86-NEXT:    movl %ecx, 32(%edx)
+; X86-NEXT:    shldl $20, %ebx, %ebp
+; X86-NEXT:    movl %ebp, 60(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    shrdl $11, %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, 16(%edx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shll $9, %ecx
-; X86-NEXT:    andl $511, %eax # imm = 0x1FF
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 72(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shll $20, %eax
-; X86-NEXT:    andl $1048575, %edi # imm = 0xFFFFF
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl %edi, 56(%ecx)
-; X86-NEXT:    shll $10, %esi
-; X86-NEXT:    andl $1023, %ebp # imm = 0x3FF
-; X86-NEXT:    orl %esi, %ebp
-; X86-NEXT:    movl %ebp, 28(%ecx)
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shll $21, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    andl $7, %eax
-; X86-NEXT:    movb %al, 102(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, 84(%ecx)
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shldl $31, %ebx, %ebp
+; X86-NEXT:    movl %ebp, 44(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $22, %ebx, %ebp
+; X86-NEXT:    movl %ebp, 32(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shrdl $11, %edx, %ebp
+; X86-NEXT:    movl %ebp, 16(%eax)
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    shll $9, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    andl $511, %edi # imm = 0x1FF
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    movl %edi, 72(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shll $20, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    andl $1048575, %ebx # imm = 0xFFFFF
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %ebx, 56(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shll $10, %edi
+; X86-NEXT:    andl $1023, %edx # imm = 0x3FF
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $10, %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shll $31, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    shll $21, %edx
+; X86-NEXT:    andl $2097151, %esi # imm = 0x1FFFFF
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $7, %edx
+; X86-NEXT:    movb %dl, 102(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shll $30, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, 84(%eax)
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $2097151, %ecx # imm = 0x1FFFFF
+; X86-NEXT:    shldl $10, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shll $31, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, 40(%eax)
+; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -3684,22 +3599,22 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind {
 ;
 ; X86-LABEL: scmp_bool_operands:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    negb %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    negb %cl
+; X86-NEXT:    cmpb %al, %cl
+; X86-NEXT:    setl %cl
+; X86-NEXT:    setg %al
+; X86-NEXT:    subb %cl, %al
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $1, %cl
 ; X86-NEXT:    negb %cl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andb $1, %dl
 ; X86-NEXT:    negb %dl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    andb $1, %ah
-; X86-NEXT:    negb %ah
-; X86-NEXT:    cmpb %al, %ah
-; X86-NEXT:    setl %ah
-; X86-NEXT:    setg %al
-; X86-NEXT:    subb %ah, %al
 ; X86-NEXT:    cmpb %cl, %dl
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
@@ -3836,13 +3751,13 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin
 ;
 ; X86-LABEL: scmp_ret_wider_than_operands:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    setg %dl
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 392bc83d9d5d8..3b606df7f9585 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -37,9 +37,9 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
@@ -93,9 +93,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $14, %ecx
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %edi
@@ -156,27 +156,27 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
-; X86-NEXT:    movswl %ax, %esi
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    movswl %ax, %edi
+; X86-NEXT:    shrl $4, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cwtd
-; X86-NEXT:    idivw %si
+; X86-NEXT:    idivw %di
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
-; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    leal -1(%eax), %esi
 ; X86-NEXT:    testw %cx, %cx
 ; X86-NEXT:    sets %cl
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %di, %di
 ; X86-NEXT:    sets %ch
 ; X86-NEXT:    xorb %cl, %ch
 ; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %ch, %cl
-; X86-NEXT:    cmovel %eax, %edi
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    movswl %di, %eax
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    movswl %si, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -308,56 +308,58 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %edi
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 16(%ebp), %ecx
 ; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    shldl $31, %edi, %esi
-; X86-NEXT:    shldl $31, %ecx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    shldl $31, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebp), %ebx
+; X86-NEXT:    shldl $31, %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    shll $31, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    shll $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl 20(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %esi
+; X86-NEXT:    subl $1, %ebx
 ; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    sets %al
 ; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    sets %bl
-; X86-NEXT:    xorb %al, %bl
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    calll __modti3
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -366,10 +368,10 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %bl, %al
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
@@ -409,9 +411,9 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
@@ -536,166 +538,161 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $60, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sarl $31, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    shll $31, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shrl $31, %edx
+; X86-NEXT:    shldl $31, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    shll $31, %ebp
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %ebx, %ecx
+; X86-NEXT:    shldl $31, %edi, %ecx
+; X86-NEXT:    shll $31, %edi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ecx
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    shll $31, %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shrl $31, %edi
-; X86-NEXT:    shldl $31, %edx, %edi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrl $31, %ebx
+; X86-NEXT:    shldl $31, %edi, %ebx
+; X86-NEXT:    shll $31, %edi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shll $31, %esi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    shrl $31, %ebp
-; X86-NEXT:    shldl $31, %ecx, %ebp
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    shldl $31, %ebp, %edi
+; X86-NEXT:    shll $31, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %cl
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    sets %dl
 ; X86-NEXT:    xorb %cl, %dl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %dl, %cl
 ; X86-NEXT:    leal -1(%eax), %ecx
 ; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %cl
-; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    sets %dl
+; X86-NEXT:    xorb %al, %dl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %cl, %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testb %dl, %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    leal -1(%edx), %eax
+; X86-NEXT:    cmovel %edx, %eax
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %al
+; X86-NEXT:    sets %dl
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %cl
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    sets %dh
+; X86-NEXT:    xorb %dl, %dh
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, (%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, 12(%esi)
+; X86-NEXT:    movl %eax, 8(%esi)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %cl, %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal -1(%eax), %ebp
-; X86-NEXT:    cmovel %eax, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    sets %al
+; X86-NEXT:    testb %dh, %al
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    leal -1(%ecx), %eax
+; X86-NEXT:    cmovel %ecx, %eax
+; X86-NEXT:    movl %eax, 4(%esi)
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    xorb %al, %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    leal -1(%edi), %esi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    leal -1(%ebp), %edi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %bl, %al
-; X86-NEXT:    cmovel %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    cmovel %ebp, %edi
+; X86-NEXT:    movl %edi, (%esi)
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7d41d5bebc8a..53604ff6d7f34 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -45,9 +45,9 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
@@ -61,11 +61,11 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
 ; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
+; X86-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmpl $-65535, %eax # imm = 0xFFFF0001
 ; X86-NEXT:    movl $-65536, %ecx # imm = 0xFFFF0000
+; X86-NEXT:    cmpl $-65535, %eax # imm = 0xFFFF0001
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -112,9 +112,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $14, %ecx
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
@@ -128,11 +128,11 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
 ; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
 ; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
+; X86-NEXT:    cmpl $16383, %eax # imm = 0x3FFF
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmpl $-16383, %eax # imm = 0xC001
 ; X86-NEXT:    movl $-16384, %ecx # imm = 0xC000
+; X86-NEXT:    cmpl $-16383, %eax # imm = 0xC001
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -184,10 +184,10 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    movswl %ax, %esi
-; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    shrl $4, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cwtd
@@ -204,13 +204,13 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-NEXT:    testb %ch, %cl
 ; X86-NEXT:    cmovnel %edi, %eax
 ; X86-NEXT:    movswl %ax, %ecx
+; X86-NEXT:    movl $16383, %edx # imm = 0x3FFF
 ; X86-NEXT:    cmpl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    cmovgel %edx, %eax
 ; X86-NEXT:    movswl %ax, %ecx
+; X86-NEXT:    movl $49152, %edx # imm = 0xC000
 ; X86-NEXT:    cmpl $-16383, %ecx # imm = 0xC001
-; X86-NEXT:    movl $49152, %ecx # imm = 0xC000
-; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    cmovll %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -282,11 +282,11 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %dl, %cl
 ; X86-NEXT:    cmovel %esi, %eax
-; X86-NEXT:    cmpb $7, %al
 ; X86-NEXT:    movl $7, %ecx
+; X86-NEXT:    cmpb $7, %al
 ; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    cmpb $-7, %cl
 ; X86-NEXT:    movl $248, %eax
+; X86-NEXT:    cmpb $-7, %cl
 ; X86-NEXT:    cmovgel %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
@@ -371,8 +371,6 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl 12(%ebp), %edi
 ; X86-NEXT:    movl 16(%ebp), %ecx
 ; X86-NEXT:    movl 20(%ebp), %edx
 ; X86-NEXT:    movl %edx, %eax
@@ -382,18 +380,20 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ebp), %ebx
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    shldl $31, %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %edi
 ; X86-NEXT:    shldl $31, %edi, %ebx
-; X86-NEXT:    shldl $31, %esi, %edi
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    shll $31, %edi
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    shll $31, %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl 20(%ebp), %eax
@@ -405,11 +405,11 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -516,9 +516,9 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    cltd
 ; X86-NEXT:    idivl %esi
@@ -532,11 +532,11 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testb %cl, %dl
 ; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmpl $-131071, %eax # imm = 0xFFFE0001
 ; X86-NEXT:    movl $-131072, %ecx # imm = 0xFFFE0000
+; X86-NEXT:    cmpl $-131071, %eax # imm = 0xFFFE0001
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -800,36 +800,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $240, %esp
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __modti3
-; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -837,28 +817,30 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __divti3
+; X86-NEXT:    calll __modti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -869,248 +851,265 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    calll __modti3
+; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    calll __modti3
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %bl
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    sets %bl
+; X86-NEXT:    xorb %bh, %bl
+; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    setne %bl
-; X86-NEXT:    testb %bh, %bl
+; X86-NEXT:    testb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl $-1, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl $0, %edi
 ; X86-NEXT:    cmovgel %ebx, %edx
+; X86-NEXT:    cmovgel %ebx, %esi
 ; X86-NEXT:    cmovgel %ebx, %ecx
-; X86-NEXT:    cmovgel %ebx, %eax
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    cmovgel %esi, %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    negl %esi
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    cmovgel %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    negl %edi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    sbbl %esi, %edi
 ; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmovgel %ebx, %edi
-; X86-NEXT:    shrdl $1, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    cmovgel %edx, %ecx
+; X86-NEXT:    cmovgel %ebx, %eax
+; X86-NEXT:    shrdl $1, %ecx, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl %eax, 12(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %bh, %cl
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    sets %dl
+; X86-NEXT:    xorb %bl, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    setne %dl
+; X86-NEXT:    testb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %edx
-; X86-NEXT:    cmovgel %ecx, %edi
-; X86-NEXT:    cmovgel %ecx, %eax
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    cmovgel %edx, %edi
+; X86-NEXT:    cmovgel %edx, %ecx
 ; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    cmovgel %ebx, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    cmovgel %ebx, %eax
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %esi
-; X86-NEXT:    shrdl $1, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    cmovgel %ebx, %ecx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    cmovgel %edx, %eax
+; X86-NEXT:    shrdl $1, %ecx, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %bh, %cl
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    sets %dl
+; X86-NEXT:    xorb %bl, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    setne %dl
+; X86-NEXT:    testb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    cmpl $-1, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %edx
-; X86-NEXT:    cmovgel %ecx, %edi
-; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    cmovgel %esi, %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    cmovgel %esi, %eax
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %ebx
-; X86-NEXT:    shrdl $1, %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %edi
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    cmovgel %edx, %edi
+; X86-NEXT:    cmovgel %edx, %ecx
+; X86-NEXT:    movl $-1, %ebx
+; X86-NEXT:    cmovgel %ebx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    cmovgel %ebx, %ecx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    cmovgel %edx, %eax
+; X86-NEXT:    shrdl $1, %ecx, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    subl $1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
@@ -1136,10 +1135,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    cmpl $-1, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmpl $-1, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl $0, %eax
@@ -1148,29 +1147,23 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    cmovgel %eax, %ebx
 ; X86-NEXT:    cmovgel %eax, %ecx
-; X86-NEXT:    cmovgel %eax, %esi
+; X86-NEXT:    cmovgel %eax, %edi
 ; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovgel %edx, %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    cmovgel %edx, %edi
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovgel %eax, %edi
-; X86-NEXT:    shrdl $1, %esi, %edi
+; X86-NEXT:    cmovgel %eax, %esi
+; X86-NEXT:    shrdl $1, %edi, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll b/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
index e8359cb088dc3..a8371e4232bc8 100644
--- a/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/llvm/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -23,8 +23,8 @@ define i32 @test_basic(i32 %l) #0 {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    movl %esp, %eax
+; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    leal 15(,%esi,4), %ecx
 ; X86-NEXT:    andl $-16, %ecx
 ; X86-NEXT:    subl %ecx, %eax
@@ -47,8 +47,8 @@ define i32 @test_basic(i32 %l) #0 {
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    je .LBB0_6
 ; X86-NEXT:  # %bb.8: # %false
-; X86-NEXT:    decl %esi
 ; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    decl %esi
 ; X86-NEXT:    movl %esi, (%esp)
 ; X86-NEXT:    calll test_basic at PLT
 ; X86-NEXT:    jmp .LBB0_7
diff --git a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
index bd51ca76c59d1..ebaf6dd442b37 100644
--- a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
 
 ; 32-bit catch-all has to use a filter function because that's how it saves the
@@ -15,6 +16,46 @@ declare void @llvm.localescape(...)
 declare ptr @llvm.eh.recoverfp(ptr, ptr)
 
 define i32 @main() personality ptr @_except_handler3 {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $28, %esp
+; CHECK-NEXT:  Lmain$frame_escape_0 = -40
+; CHECK-NEXT:    movl %esp, -36(%ebp)
+; CHECK-NEXT:    movl $-1, -16(%ebp)
+; CHECK-NEXT:    movl $L__ehtable$main, -20(%ebp)
+; CHECK-NEXT:    movl $__except_handler3, -24(%ebp)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, -28(%ebp)
+; CHECK-NEXT:    leal -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, -16(%ebp)
+; CHECK-NEXT:    calll _crash
+; CHECK-NEXT:  LBB0_2: # %__try.cont
+; CHECK-NEXT:    movl -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB0_1: # %__except
+; CHECK-NEXT:  $ehgcr_0_1:
+; CHECK-NEXT:    movl -24(%ebp), %esp
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    movl $-1, -16(%ebp)
+; CHECK-NEXT:    pushl -40(%ebp)
+; CHECK-NEXT:    pushl $_str
+; CHECK-NEXT:    calll _printf
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    jmp LBB0_2
+; CHECK-NEXT:  Lfunc_end0:
 entry:
   %__exceptioncode = alloca i32, align 4
   call void (...) @llvm.localescape(ptr %__exceptioncode)
@@ -35,6 +76,21 @@ __try.cont:                                       ; preds = %entry, %__except
 }
 
 define internal i32 @"filt$main"() {
+; CHECK-LABEL: filt$main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    movl (%ebp), %eax
+; CHECK-NEXT:    movl $Lmain$parent_frame_offset, %ecx
+; CHECK-NEXT:    movl -20(%eax), %edx
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    movl (%edx), %ecx
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    movl $Lmain$frame_escape_0, %edx
+; CHECK-NEXT:    movl %ecx, -24(%edx,%eax)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %ebp = tail call ptr @llvm.frameaddress(i32 1)
   %parentfp = tail call ptr @llvm.eh.recoverfp(ptr @main, ptr %ebp)
diff --git a/llvm/test/CodeGen/X86/seh-stack-realign.ll b/llvm/test/CodeGen/X86/seh-stack-realign.ll
index ae687343cc504..125475aca96dc 100644
--- a/llvm/test/CodeGen/X86/seh-stack-realign.ll
+++ b/llvm/test/CodeGen/X86/seh-stack-realign.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
 
 ; 32-bit catch-all has to use a filter function because that's how it saves the
@@ -15,6 +16,50 @@ declare void @llvm.localescape(...)
 declare ptr @llvm.eh.recoverfp(ptr, ptr)
 
 define i32 @main() personality ptr @_except_handler3 {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    andl $-8, %esp
+; CHECK-NEXT:    subl $40, %esp
+; CHECK-NEXT:    movl %esp, %esi
+; CHECK-NEXT:    movl %ebp, 4(%esi)
+; CHECK-NEXT:  Lmain$frame_escape_0 = 8
+; CHECK-NEXT:    movl %esp, 12(%esi)
+; CHECK-NEXT:    movl $-1, 32(%esi)
+; CHECK-NEXT:    movl $L__ehtable$main, 28(%esi)
+; CHECK-NEXT:    movl $__except_handler3, 24(%esi)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, 20(%esi)
+; CHECK-NEXT:    leal 20(%esi), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, 32(%esi)
+; CHECK-NEXT:    calll _crash
+; CHECK-NEXT:  LBB0_2: # %__try.cont
+; CHECK-NEXT:    movl 20(%esi), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    leal -12(%ebp), %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB0_1: # %__except
+; CHECK-NEXT:  $ehgcr_0_1:
+; CHECK-NEXT:    movl -24(%ebp), %esp
+; CHECK-NEXT:    leal -36(%ebp), %esi
+; CHECK-NEXT:    movl 4(%esi), %ebp
+; CHECK-NEXT:    movl $-1, 32(%esi)
+; CHECK-NEXT:    pushl 8(%esi)
+; CHECK-NEXT:    pushl $_str
+; CHECK-NEXT:    calll _printf
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    jmp LBB0_2
+; CHECK-NEXT:  Lfunc_end0:
 entry:
   ; The EH code allocation is overaligned, triggering realignment.
   %__exceptioncode = alloca i32, align 8
@@ -36,6 +81,21 @@ __try.cont:                                       ; preds = %entry, %__except
 }
 
 define internal i32 @"filt$main"() {
+; CHECK-LABEL: filt$main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    movl (%ebp), %eax
+; CHECK-NEXT:    movl $Lmain$parent_frame_offset, %ecx
+; CHECK-NEXT:    movl -20(%eax), %edx
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    movl (%edx), %ecx
+; CHECK-NEXT:    movl (%ecx), %ecx
+; CHECK-NEXT:    movl $Lmain$frame_escape_0, %edx
+; CHECK-NEXT:    movl %ecx, -24(%edx,%eax)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %ebp = tail call ptr @llvm.frameaddress(i32 1)
   %parentfp = tail call ptr @llvm.eh.recoverfp(ptr @main, ptr %ebp)
diff --git a/llvm/test/CodeGen/X86/select-constant-xor.ll b/llvm/test/CodeGen/X86/select-constant-xor.ll
index efc367d204cf1..de6de4b8ca644 100644
--- a/llvm/test/CodeGen/X86/select-constant-xor.ll
+++ b/llvm/test/CodeGen/X86/select-constant-xor.ll
@@ -224,18 +224,17 @@ define i32 @icmpasrne(i32 %input, i32 %a, i32 %b) {
 define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
 ; X86-LABEL: oneusecmp:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    js .LBB10_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl (%ecx), %eax
-; X86-NEXT:    retl
+; X86-NEXT:    jmp .LBB10_3
 ; X86-NEXT:  .LBB10_1:
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:  .LBB10_3:
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    addl (%ecx), %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/select-lea.ll b/llvm/test/CodeGen/X86/select-lea.ll
index c3435dad67974..df18fa6058d89 100644
--- a/llvm/test/CodeGen/X86/select-lea.ll
+++ b/llvm/test/CodeGen/X86/select-lea.ll
@@ -55,18 +55,18 @@ define i32 @sadd_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ; CMOV-NEXT:    pushl %esi
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CMOV-NEXT:    leal (%eax,%edx), %esi
-; CMOV-NEXT:    addl (%ecx), %esi
-; CMOV-NEXT:    addl %edx, %eax
-; CMOV-NEXT:    cmovol %esi, %eax
+; CMOV-NEXT:    leal (%eax,%ecx), %edx
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CMOV-NEXT:    addl (%esi), %edx
+; CMOV-NEXT:    addl %ecx, %eax
+; CMOV-NEXT:    cmovol %edx, %eax
 ; CMOV-NEXT:    popl %esi
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: sadd_add_load:
 ; NOCMOV:       # %bb.0:
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    leal (%eax,%edx), %ecx
 ; NOCMOV-NEXT:    addl %edx, %eax
 ; NOCMOV-NEXT:    jno .LBB1_2
@@ -135,18 +135,18 @@ define i32 @uadd_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ; CMOV-NEXT:    pushl %esi
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CMOV-NEXT:    leal (%eax,%edx), %esi
-; CMOV-NEXT:    addl (%ecx), %esi
-; CMOV-NEXT:    addl %edx, %eax
-; CMOV-NEXT:    cmovbl %esi, %eax
+; CMOV-NEXT:    leal (%eax,%ecx), %edx
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CMOV-NEXT:    addl (%esi), %edx
+; CMOV-NEXT:    addl %ecx, %eax
+; CMOV-NEXT:    cmovbl %edx, %eax
 ; CMOV-NEXT:    popl %esi
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: uadd_add_load:
 ; NOCMOV:       # %bb.0:
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    leal (%eax,%edx), %ecx
 ; NOCMOV-NEXT:    addl %edx, %eax
 ; NOCMOV-NEXT:    jae .LBB3_2
@@ -214,19 +214,19 @@ define i32 @ssub_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ; CMOV-NEXT:    pushl %esi
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CMOV-NEXT:    movl %eax, %esi
-; CMOV-NEXT:    subl %edx, %esi
-; CMOV-NEXT:    addl (%ecx), %esi
-; CMOV-NEXT:    subl %edx, %eax
-; CMOV-NEXT:    cmovol %esi, %eax
+; CMOV-NEXT:    movl %eax, %edx
+; CMOV-NEXT:    subl %ecx, %edx
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CMOV-NEXT:    addl (%esi), %edx
+; CMOV-NEXT:    subl %ecx, %eax
+; CMOV-NEXT:    cmovol %edx, %eax
 ; CMOV-NEXT:    popl %esi
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: ssub_add_load:
 ; NOCMOV:       # %bb.0:
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl %eax, %ecx
 ; NOCMOV-NEXT:    subl %edx, %ecx
 ; NOCMOV-NEXT:    subl %edx, %eax
@@ -295,19 +295,19 @@ define i32 @usub_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ; CMOV-NEXT:    pushl %esi
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CMOV-NEXT:    movl %eax, %esi
-; CMOV-NEXT:    subl %edx, %esi
-; CMOV-NEXT:    addl (%ecx), %esi
-; CMOV-NEXT:    subl %edx, %eax
-; CMOV-NEXT:    cmovbl %esi, %eax
+; CMOV-NEXT:    movl %eax, %edx
+; CMOV-NEXT:    subl %ecx, %edx
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CMOV-NEXT:    addl (%esi), %edx
+; CMOV-NEXT:    subl %ecx, %eax
+; CMOV-NEXT:    cmovbl %edx, %eax
 ; CMOV-NEXT:    popl %esi
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: usub_add_load:
 ; NOCMOV:       # %bb.0:
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl %eax, %ecx
 ; NOCMOV-NEXT:    subl %edx, %ecx
 ; NOCMOV-NEXT:    subl %edx, %eax
@@ -376,19 +376,19 @@ define i32 @smul_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ; CMOV-NEXT:    pushl %esi
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CMOV-NEXT:    movl %eax, %esi
-; CMOV-NEXT:    imull %edx, %esi
-; CMOV-NEXT:    addl (%ecx), %esi
-; CMOV-NEXT:    imull %edx, %eax
-; CMOV-NEXT:    cmovol %esi, %eax
+; CMOV-NEXT:    movl %eax, %edx
+; CMOV-NEXT:    imull %ecx, %edx
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CMOV-NEXT:    addl (%esi), %edx
+; CMOV-NEXT:    imull %ecx, %eax
+; CMOV-NEXT:    cmovol %edx, %eax
 ; CMOV-NEXT:    popl %esi
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: smul_add_load:
 ; NOCMOV:       # %bb.0:
-; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; NOCMOV-NEXT:    movl %eax, %ecx
 ; NOCMOV-NEXT:    imull %edx, %ecx
 ; NOCMOV-NEXT:    imull %edx, %eax
@@ -459,14 +459,14 @@ define i32 @umul_add_load(i32 %x, i32 %y, ptr %pz) nounwind {
 ;
 ; CMOV-LABEL: umul_add_load:
 ; CMOV:       # %bb.0:
-; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CMOV-NEXT:    mull {{[0-9]+}}(%esp)
-; CMOV-NEXT:    seto %dl
-; CMOV-NEXT:    movl (%ecx), %ecx
-; CMOV-NEXT:    addl %eax, %ecx
-; CMOV-NEXT:    testb %dl, %dl
-; CMOV-NEXT:    cmovnel %ecx, %eax
+; CMOV-NEXT:    seto %cl
+; CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CMOV-NEXT:    movl (%edx), %edx
+; CMOV-NEXT:    addl %eax, %edx
+; CMOV-NEXT:    testb %cl, %cl
+; CMOV-NEXT:    cmovnel %edx, %eax
 ; CMOV-NEXT:    retl
 ;
 ; NOCMOV-LABEL: umul_add_load:
diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll
index 8a4308a5af64b..c6d0cff49408c 100644
--- a/llvm/test/CodeGen/X86/select-mmx.ll
+++ b/llvm/test/CodeGen/X86/select-mmx.ll
@@ -32,9 +32,9 @@ define i64 @test47(i64 %arg)  {
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    orl 12(%ebp), %eax
 ; X86-NEXT:    movl $7, %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    orl 12(%ebp), %ecx
 ; X86-NEXT:    je .LBB0_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/select-smin-smax.ll b/llvm/test/CodeGen/X86/select-smin-smax.ll
index 3c2ec52f2c261..a417d572dc0d0 100644
--- a/llvm/test/CodeGen/X86/select-smin-smax.ll
+++ b/llvm/test/CodeGen/X86/select-smin-smax.ll
@@ -191,8 +191,8 @@ define i64 @test_i64_smax(i64 %a) nounwind {
 ;
 ; X86-BMI-LABEL: test_i64_smax:
 ; X86-BMI:       # %bb.0:
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    xorl %eax, %eax
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI-NEXT:    testl %edx, %edx
 ; X86-BMI-NEXT:    cmovlel %eax, %edx
 ; X86-BMI-NEXT:    cmovnsl {{[0-9]+}}(%esp), %eax
@@ -250,56 +250,54 @@ define i128 @test_i128_smax(i128 %a) nounwind {
 ;
 ; X86-BMI-LABEL: test_i128_smax:
 ; X86-BMI:       # %bb.0:
-; X86-BMI-NEXT:    pushl %edi
-; X86-BMI-NEXT:    pushl %esi
-; X86-BMI-NEXT:    pushl %eax
+; X86-BMI-NEXT:    subl $12, %esp
+; X86-BMI-NEXT:    xorl %ecx, %ecx
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    testl %edx, %edx
+; X86-BMI-NEXT:    cmovlel %ecx, %edx
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI-NEXT:    xorl %edx, %edx
-; X86-BMI-NEXT:    testl %ecx, %ecx
-; X86-BMI-NEXT:    cmovlel %edx, %ecx
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI-NEXT:    cmovsl %edx, %esi
-; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI-NEXT:    cmovsl %edx, %edi
-; X86-BMI-NEXT:    cmovnsl {{[0-9]+}}(%esp), %edx
-; X86-BMI-NEXT:    movl %ecx, 12(%eax)
+; X86-BMI-NEXT:    movl %edx, 12(%eax)
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    cmovsl %ecx, %edx
 ; X86-BMI-NEXT:    movl %edx, 8(%eax)
-; X86-BMI-NEXT:    movl %edi, 4(%eax)
-; X86-BMI-NEXT:    movl %esi, (%eax)
-; X86-BMI-NEXT:    addl $4, %esp
-; X86-BMI-NEXT:    popl %esi
-; X86-BMI-NEXT:    popl %edi
+; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI-NEXT:    cmovsl %ecx, %edx
+; X86-BMI-NEXT:    movl %edx, 4(%eax)
+; X86-BMI-NEXT:    cmovnsl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT:    movl %ecx, (%eax)
+; X86-BMI-NEXT:    addl $12, %esp
 ; X86-BMI-NEXT:    retl $4
 ;
 ; X86-NOBMI-LABEL: test_i128_smax:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    pushl %edi
-; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    pushl %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    testl %ecx, %ecx
-; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    movl $0, %esi
-; X86-NOBMI-NEXT:    movl $0, %edi
-; X86-NOBMI-NEXT:    js .LBB8_2
-; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    subl $12, %esp
+; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    testl %edx, %edx
+; X86-NOBMI-NEXT:    jg .LBB8_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl $0, %edx
 ; X86-NOBMI-NEXT:  .LBB8_2:
-; X86-NOBMI-NEXT:    jg .LBB8_4
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %edx, 12(%eax)
+; X86-NOBMI-NEXT:    movl $0, %edx
+; X86-NOBMI-NEXT:    js .LBB8_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    xorl %ecx, %ecx
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:  .LBB8_4:
-; X86-NOBMI-NEXT:    movl %ecx, 12(%eax)
-; X86-NOBMI-NEXT:    movl %edi, 8(%eax)
-; X86-NOBMI-NEXT:    movl %esi, 4(%eax)
-; X86-NOBMI-NEXT:    movl %edx, (%eax)
-; X86-NOBMI-NEXT:    addl $4, %esp
-; X86-NOBMI-NEXT:    popl %esi
-; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    movl %edx, 8(%eax)
+; X86-NOBMI-NEXT:    movl $0, %edx
+; X86-NOBMI-NEXT:    js .LBB8_6
+; X86-NOBMI-NEXT:  # %bb.5:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:  .LBB8_6:
+; X86-NOBMI-NEXT:    movl %edx, 4(%eax)
+; X86-NOBMI-NEXT:    js .LBB8_8
+; X86-NOBMI-NEXT:  # %bb.7:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:  .LBB8_8:
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    retl $4
   %r = call i128 @llvm.smax.i128(i128 %a, i128 0)
   ret i128 %r
@@ -317,26 +315,22 @@ define i128 @test_i128_smin(i128 %a) nounwind {
 ;
 ; X86-LABEL: test_i128_smin:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    andl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    andl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl %edx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %ecx, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    addl $4, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl $4
   %r = call i128 @llvm.smin.i128(i128 %a, i128 0)
   ret i128 %r
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index fe90028aa6aff..60f752f7bb2fe 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -26,9 +26,9 @@ define i32 @test1(ptr %p, ptr %q, i1 %r) nounwind {
 ;
 ; ATHLON-LABEL: test1:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    cmovnel %eax, %ecx
 ; ATHLON-NEXT:    movl (%ecx), %eax
 ; ATHLON-NEXT:    movl 8(%eax), %eax
@@ -91,10 +91,10 @@ define i32 @test2() nounwind {
 ; ATHLON-NEXT:    subl $12, %esp
 ; ATHLON-NEXT:    calll _return_false
 ; ATHLON-NEXT:    xorl %ecx, %ecx
+; ATHLON-NEXT:    movl $-3840, %edx ## imm = 0xF100
 ; ATHLON-NEXT:    testb $1, %al
-; ATHLON-NEXT:    movl $-3840, %eax ## imm = 0xF100
-; ATHLON-NEXT:    cmovnel %ecx, %eax
-; ATHLON-NEXT:    cmpl $32768, %eax ## imm = 0x8000
+; ATHLON-NEXT:    cmovnel %ecx, %edx
+; ATHLON-NEXT:    cmpl $32768, %edx ## imm = 0x8000
 ; ATHLON-NEXT:    jge LBB1_1
 ; ATHLON-NEXT:  ## %bb.2: ## %bb91
 ; ATHLON-NEXT:    xorl %eax, %eax
@@ -184,14 +184,14 @@ define signext i8 @test4(ptr nocapture %P, double %F) nounwind readonly {
 ;
 ; ATHLON-LABEL: test4:
 ; ATHLON:       ## %bb.0: ## %entry
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    fldl {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
-; ATHLON-NEXT:    xorl %ecx, %ecx
+; ATHLON-NEXT:    xorl %eax, %eax
 ; ATHLON-NEXT:    fucompi %st(1), %st
 ; ATHLON-NEXT:    fstp %st(0)
-; ATHLON-NEXT:    seta %cl
-; ATHLON-NEXT:    movsbl (%eax,%ecx,4), %eax
+; ATHLON-NEXT:    seta %al
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    movsbl (%ecx,%eax,4), %eax
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: test4:
@@ -240,35 +240,34 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, ptr %p) nounwind {
 ;
 ; ATHLON-LABEL: test5:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    pushl %esi
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    cmovnel %eax, %ecx
+; ATHLON-NEXT:    movzwl (%ecx), %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    movw %ax, 2(%ecx)
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    cmovnel %ecx, %edx
-; ATHLON-NEXT:    movzwl (%edx), %ecx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; ATHLON-NEXT:    cmovnel %edx, %esi
-; ATHLON-NEXT:    movzwl (%esi), %edx
-; ATHLON-NEXT:    movw %dx, 2(%eax)
-; ATHLON-NEXT:    movw %cx, (%eax)
-; ATHLON-NEXT:    popl %esi
+; ATHLON-NEXT:    cmovnel %eax, %edx
+; ATHLON-NEXT:    movzwl (%edx), %eax
+; ATHLON-NEXT:    movw %ax, (%ecx)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: test5:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    pushl %esi
-; MCU-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; MCU-NEXT:    testb $1, %al
 ; MCU-NEXT:    jne .LBB4_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; MCU-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; MCU-NEXT:  .LBB4_2:
-; MCU-NEXT:    movw %cx, 2(%esi)
-; MCU-NEXT:    movw %dx, (%esi)
-; MCU-NEXT:    popl %esi
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    movw %cx, 2(%eax)
+; MCU-NEXT:    jne .LBB4_4
+; MCU-NEXT:  # %bb.3:
+; MCU-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:  .LBB4_4:
+; MCU-NEXT:    movw %dx, (%eax)
 ; MCU-NEXT:    retl
   %x = select i1 %c, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %x, ptr %p
@@ -293,90 +292,86 @@ define void @test6(i32 %C, ptr %A, ptr %B) nounwind {
 ;
 ; ATHLON-LABEL: test6:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    flds 12(%eax)
-; ATHLON-NEXT:    flds 8(%eax)
-; ATHLON-NEXT:    flds 4(%eax)
-; ATHLON-NEXT:    flds (%eax)
-; ATHLON-NEXT:    flds (%ecx)
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    flds 12(%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
 ; ATHLON-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    fxch %st(1)
 ; ATHLON-NEXT:    fcmove %st(1), %st
 ; ATHLON-NEXT:    fstp %st(1)
-; ATHLON-NEXT:    flds 4(%ecx)
-; ATHLON-NEXT:    fmul %st, %st(0)
-; ATHLON-NEXT:    fxch %st(2)
-; ATHLON-NEXT:    fcmove %st(2), %st
-; ATHLON-NEXT:    fstp %st(2)
+; ATHLON-NEXT:    fstps 12(%eax)
+; ATHLON-NEXT:    flds 8(%eax)
 ; ATHLON-NEXT:    flds 8(%ecx)
 ; ATHLON-NEXT:    fmul %st, %st(0)
-; ATHLON-NEXT:    fxch %st(3)
-; ATHLON-NEXT:    fcmove %st(3), %st
-; ATHLON-NEXT:    fstp %st(3)
-; ATHLON-NEXT:    flds 12(%ecx)
-; ATHLON-NEXT:    fmul %st, %st(0)
-; ATHLON-NEXT:    fxch %st(4)
-; ATHLON-NEXT:    fcmove %st(4), %st
-; ATHLON-NEXT:    fstp %st(4)
-; ATHLON-NEXT:    fxch %st(3)
-; ATHLON-NEXT:    fstps 12(%eax)
 ; ATHLON-NEXT:    fxch %st(1)
+; ATHLON-NEXT:    fcmove %st(1), %st
+; ATHLON-NEXT:    fstp %st(1)
 ; ATHLON-NEXT:    fstps 8(%eax)
+; ATHLON-NEXT:    flds 4(%eax)
+; ATHLON-NEXT:    flds 4(%ecx)
+; ATHLON-NEXT:    fmul %st, %st(0)
+; ATHLON-NEXT:    fxch %st(1)
+; ATHLON-NEXT:    fcmove %st(1), %st
+; ATHLON-NEXT:    fstp %st(1)
 ; ATHLON-NEXT:    fstps 4(%eax)
+; ATHLON-NEXT:    flds (%eax)
+; ATHLON-NEXT:    flds (%ecx)
+; ATHLON-NEXT:    fmul %st, %st(0)
+; ATHLON-NEXT:    fxch %st(1)
+; ATHLON-NEXT:    fcmove %st(1), %st
+; ATHLON-NEXT:    fstp %st(1)
 ; ATHLON-NEXT:    fstps (%eax)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: test6:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    pushl %eax
 ; MCU-NEXT:    flds 12(%edx)
-; MCU-NEXT:    fstps (%esp) # 4-byte Folded Spill
-; MCU-NEXT:    flds 8(%edx)
-; MCU-NEXT:    flds 4(%edx)
-; MCU-NEXT:    flds (%ecx)
-; MCU-NEXT:    flds 4(%ecx)
-; MCU-NEXT:    flds 8(%ecx)
 ; MCU-NEXT:    flds 12(%ecx)
 ; MCU-NEXT:    fmul %st, %st(0)
-; MCU-NEXT:    fxch %st(1)
-; MCU-NEXT:    fmul %st, %st(0)
-; MCU-NEXT:    fxch %st(2)
-; MCU-NEXT:    fmul %st, %st(0)
-; MCU-NEXT:    fxch %st(3)
-; MCU-NEXT:    fmul %st, %st(0)
 ; MCU-NEXT:    testl %eax, %eax
-; MCU-NEXT:    flds (%edx)
 ; MCU-NEXT:    je .LBB5_2
 ; MCU-NEXT:  # %bb.1:
-; MCU-NEXT:    fstp %st(1)
-; MCU-NEXT:    fstp %st(3)
-; MCU-NEXT:    fstp %st(1)
 ; MCU-NEXT:    fstp %st(0)
-; MCU-NEXT:    flds (%esp) # 4-byte Folded Reload
-; MCU-NEXT:    fldz
 ; MCU-NEXT:    fldz
-; MCU-NEXT:    fldz
-; MCU-NEXT:    fxch %st(1)
-; MCU-NEXT:    fxch %st(6)
-; MCU-NEXT:    fxch %st(1)
-; MCU-NEXT:    fxch %st(5)
-; MCU-NEXT:    fxch %st(4)
 ; MCU-NEXT:    fxch %st(1)
-; MCU-NEXT:    fxch %st(3)
-; MCU-NEXT:    fxch %st(2)
 ; MCU-NEXT:  .LBB5_2:
-; MCU-NEXT:    fstp %st(0)
-; MCU-NEXT:    fstp %st(5)
-; MCU-NEXT:    fstp %st(3)
-; MCU-NEXT:    fxch %st(2)
+; MCU-NEXT:    fstp %st(1)
 ; MCU-NEXT:    fstps 12(%edx)
+; MCU-NEXT:    flds 8(%edx)
+; MCU-NEXT:    flds 8(%ecx)
+; MCU-NEXT:    fmul %st, %st(0)
+; MCU-NEXT:    je .LBB5_4
+; MCU-NEXT:  # %bb.3:
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    fldz
 ; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:  .LBB5_4:
+; MCU-NEXT:    fstp %st(1)
 ; MCU-NEXT:    fstps 8(%edx)
+; MCU-NEXT:    flds 4(%edx)
+; MCU-NEXT:    flds 4(%ecx)
+; MCU-NEXT:    fmul %st, %st(0)
+; MCU-NEXT:    je .LBB5_6
+; MCU-NEXT:  # %bb.5:
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    fldz
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:  .LBB5_6:
+; MCU-NEXT:    fstp %st(1)
 ; MCU-NEXT:    fstps 4(%edx)
+; MCU-NEXT:    flds (%edx)
+; MCU-NEXT:    flds (%ecx)
+; MCU-NEXT:    fmul %st, %st(0)
+; MCU-NEXT:    je .LBB5_8
+; MCU-NEXT:  # %bb.7:
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    fldz
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:  .LBB5_8:
+; MCU-NEXT:    fstp %st(1)
 ; MCU-NEXT:    fstps (%edx)
-; MCU-NEXT:    popl %eax
 ; MCU-NEXT:    retl
   %tmp = load <4 x float>, ptr %A
   %tmp3 = load <4 x float>, ptr %B
@@ -505,44 +500,44 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi
 ; ATHLON-NEXT:    pushl %ebx
 ; ATHLON-NEXT:    pushl %edi
 ; ATHLON-NEXT:    pushl %esi
-; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ebx
-; ATHLON-NEXT:    cmovnel %eax, %ebx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; ATHLON-NEXT:    cmovnel %eax, %edi
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; ATHLON-NEXT:    cmovnel %eax, %esi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    cmovnel %ecx, %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    cmovnel %eax, %edx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; ATHLON-NEXT:    cmovnel %eax, %ecx
+; ATHLON-NEXT:    cmovnel %edx, %ecx
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; ATHLON-NEXT:    cmovnel %esi, %edx
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; ATHLON-NEXT:    cmovnel %esi, %edi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; ATHLON-NEXT:    cmovnel %esi, %ebx
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %esi
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ebp
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    cmovnel %ebp, %eax
-; ATHLON-NEXT:    movl (%ebx), %ebp
-; ATHLON-NEXT:    movl (%edi), %ebx
-; ATHLON-NEXT:    movl (%esi), %edi
-; ATHLON-NEXT:    movl (%edx), %esi
-; ATHLON-NEXT:    movl (%ecx), %edx
-; ATHLON-NEXT:    movl (%eax), %ecx
+; ATHLON-NEXT:    cmovnel %esi, %ebp
+; ATHLON-NEXT:    movl (%ebp), %ebp
 ; ATHLON-NEXT:    decl %ebp
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    movl %ebp, 20(%eax)
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; ATHLON-NEXT:    movl %ebp, 16(%esi)
+; ATHLON-NEXT:    movl (%ebx), %ebx
 ; ATHLON-NEXT:    decl %ebx
-; ATHLON-NEXT:    movl %ebx, 16(%eax)
+; ATHLON-NEXT:    movl %ebx, 12(%esi)
+; ATHLON-NEXT:    movl (%edi), %edi
 ; ATHLON-NEXT:    decl %edi
-; ATHLON-NEXT:    movl %edi, 12(%eax)
-; ATHLON-NEXT:    decl %esi
-; ATHLON-NEXT:    movl %esi, 8(%eax)
+; ATHLON-NEXT:    movl %edi, 8(%esi)
+; ATHLON-NEXT:    movl (%edx), %edx
 ; ATHLON-NEXT:    decl %edx
-; ATHLON-NEXT:    movl %edx, 4(%eax)
+; ATHLON-NEXT:    movl %edx, 4(%esi)
+; ATHLON-NEXT:    movl (%ecx), %ecx
 ; ATHLON-NEXT:    decl %ecx
-; ATHLON-NEXT:    movl %ecx, (%eax)
+; ATHLON-NEXT:    movl %ecx, 20(%esi)
+; ATHLON-NEXT:    movl (%eax), %eax
+; ATHLON-NEXT:    decl %eax
+; ATHLON-NEXT:    movl %eax, (%esi)
 ; ATHLON-NEXT:    popl %esi
 ; ATHLON-NEXT:    popl %edi
 ; ATHLON-NEXT:    popl %ebx
@@ -558,7 +553,7 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi
 ; MCU-NEXT:    testb $1, %al
 ; MCU-NEXT:    jne .LBB7_1
 ; MCU-NEXT:  # %bb.2:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; MCU-NEXT:    je .LBB7_5
 ; MCU-NEXT:  .LBB7_4:
 ; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ecx
@@ -567,13 +562,16 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi
 ; MCU-NEXT:    leal {{[0-9]+}}(%esp), %esi
 ; MCU-NEXT:    je .LBB7_11
 ; MCU-NEXT:  .LBB7_10:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
 ; MCU-NEXT:    je .LBB7_14
 ; MCU-NEXT:  .LBB7_13:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; MCU-NEXT:    jmp .LBB7_15
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; MCU-NEXT:    je .LBB7_17
+; MCU-NEXT:  .LBB7_16:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT:    jmp .LBB7_18
 ; MCU-NEXT:  .LBB7_1:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; MCU-NEXT:    jne .LBB7_4
 ; MCU-NEXT:  .LBB7_5:
 ; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ecx
@@ -582,36 +580,32 @@ define void @test8(i1 %c, ptr %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwi
 ; MCU-NEXT:    leal {{[0-9]+}}(%esp), %esi
 ; MCU-NEXT:    jne .LBB7_10
 ; MCU-NEXT:  .LBB7_11:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
 ; MCU-NEXT:    jne .LBB7_13
 ; MCU-NEXT:  .LBB7_14:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; MCU-NEXT:  .LBB7_15:
-; MCU-NEXT:    movl (%edi), %ebx
-; MCU-NEXT:    movl (%ecx), %edi
-; MCU-NEXT:    movl (%esi), %esi
-; MCU-NEXT:    movl (%ebp), %ecx
-; MCU-NEXT:    movl (%eax), %eax
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebx
 ; MCU-NEXT:    jne .LBB7_16
-; MCU-NEXT:  # %bb.17:
-; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
-; MCU-NEXT:    jmp .LBB7_18
-; MCU-NEXT:  .LBB7_16:
+; MCU-NEXT:  .LBB7_17:
 ; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
 ; MCU-NEXT:  .LBB7_18:
 ; MCU-NEXT:    movl (%ebp), %ebp
 ; MCU-NEXT:    decl %ebp
-; MCU-NEXT:    decl %eax
-; MCU-NEXT:    decl %ecx
-; MCU-NEXT:    decl %esi
-; MCU-NEXT:    decl %edi
+; MCU-NEXT:    movl %ebp, 16(%edx)
+; MCU-NEXT:    movl (%ebx), %ebx
 ; MCU-NEXT:    decl %ebx
-; MCU-NEXT:    movl %ebx, 20(%edx)
-; MCU-NEXT:    movl %edi, 16(%edx)
-; MCU-NEXT:    movl %esi, 12(%edx)
-; MCU-NEXT:    movl %ecx, 8(%edx)
-; MCU-NEXT:    movl %eax, 4(%edx)
-; MCU-NEXT:    movl %ebp, (%edx)
+; MCU-NEXT:    movl %ebx, 12(%edx)
+; MCU-NEXT:    movl (%edi), %edi
+; MCU-NEXT:    decl %edi
+; MCU-NEXT:    movl %edi, 8(%edx)
+; MCU-NEXT:    movl (%esi), %esi
+; MCU-NEXT:    decl %esi
+; MCU-NEXT:    movl %esi, 4(%edx)
+; MCU-NEXT:    movl (%ecx), %ecx
+; MCU-NEXT:    decl %ecx
+; MCU-NEXT:    movl %ecx, 20(%edx)
+; MCU-NEXT:    movl (%eax), %eax
+; MCU-NEXT:    decl %eax
+; MCU-NEXT:    movl %eax, (%edx)
 ; MCU-NEXT:    popl %esi
 ; MCU-NEXT:    popl %edi
 ; MCU-NEXT:    popl %ebx
@@ -637,9 +631,9 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; ATHLON-LABEL: test9:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl $-1, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    movl $-1, %edx
 ; ATHLON-NEXT:    je LBB8_2
 ; ATHLON-NEXT:  ## %bb.1:
@@ -677,9 +671,9 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; ATHLON-LABEL: test9a:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl $-1, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    movl $-1, %edx
 ; ATHLON-NEXT:    je LBB9_2
 ; ATHLON-NEXT:  ## %bb.1:
@@ -690,8 +684,9 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; MCU-LABEL: test9a:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl %eax, %ecx
 ; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    orl %edx, %ecx
 ; MCU-NEXT:    movl $-1, %edx
 ; MCU-NEXT:    je .LBB9_2
 ; MCU-NEXT:  # %bb.1:
@@ -789,9 +784,9 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; ATHLON-LABEL: test11:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl $-1, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    movl $-1, %edx
 ; ATHLON-NEXT:    jne LBB12_2
 ; ATHLON-NEXT:  ## %bb.1:
@@ -828,9 +823,9 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; ATHLON-LABEL: test11a:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl $-1, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    movl $-1, %edx
 ; ATHLON-NEXT:    jne LBB13_2
 ; ATHLON-NEXT:  ## %bb.1:
@@ -841,8 +836,9 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ;
 ; MCU-LABEL: test11a:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl %eax, %ecx
 ; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    orl %edx, %ecx
 ; MCU-NEXT:    movl $-1, %edx
 ; MCU-NEXT:    jne .LBB13_2
 ; MCU-NEXT:  # %bb.1:
@@ -1031,8 +1027,8 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
 ;
 ; ATHLON-LABEL: test13:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    xorl %eax, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    sbbl %eax, %eax
 ; ATHLON-NEXT:    retl
@@ -1068,8 +1064,8 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
 ;
 ; ATHLON-LABEL: test14:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    xorl %eax, %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    adcl $-1, %eax
 ; ATHLON-NEXT:    retl
@@ -1227,9 +1223,9 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
 ;
 ; ATHLON-LABEL: test18:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    cmpl $15, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    cmpl $15, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    cmovll %eax, %ecx
 ; ATHLON-NEXT:    movzbl (%ecx), %eax
 ; ATHLON-NEXT:    retl
@@ -1271,9 +1267,9 @@ define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
 ;
 ; ATHLON-LABEL: trunc_select_miscompile:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    orb $2, %cl
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    shll %cl, %eax
 ; ATHLON-NEXT:    retl
 ;
@@ -1315,27 +1311,27 @@ define void @clamp_i8(i32 %src, ptr %dst) {
 ;
 ; ATHLON-LABEL: clamp_i8:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    movl $127, %eax
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    cmpl $127, %ecx
-; ATHLON-NEXT:    movl $127, %edx
-; ATHLON-NEXT:    cmovlel %ecx, %edx
-; ATHLON-NEXT:    cmpl $-128, %edx
+; ATHLON-NEXT:    cmovlel %ecx, %eax
 ; ATHLON-NEXT:    movl $128, %ecx
-; ATHLON-NEXT:    cmovgel %edx, %ecx
+; ATHLON-NEXT:    cmpl $-128, %eax
+; ATHLON-NEXT:    cmovgel %eax, %ecx
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movb %cl, (%eax)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: clamp_i8:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    cmpl $127, %eax
 ; MCU-NEXT:    movl $127, %ecx
+; MCU-NEXT:    cmpl $127, %eax
 ; MCU-NEXT:    jg .LBB26_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movl %eax, %ecx
 ; MCU-NEXT:  .LBB26_2:
-; MCU-NEXT:    cmpl $-128, %ecx
 ; MCU-NEXT:    movb $-128, %al
+; MCU-NEXT:    cmpl $-128, %ecx
 ; MCU-NEXT:    jl .LBB26_4
 ; MCU-NEXT:  # %bb.3:
 ; MCU-NEXT:    movl %ecx, %eax
@@ -1377,27 +1373,27 @@ define void @clamp(i32 %src, ptr %dst) {
 ;
 ; ATHLON-LABEL: clamp:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    movl $32767, %eax ## imm = 0x7FFF
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; ATHLON-NEXT:    cmpl $32768, %ecx ## imm = 0x8000
-; ATHLON-NEXT:    movl $32767, %edx ## imm = 0x7FFF
-; ATHLON-NEXT:    cmovll %ecx, %edx
-; ATHLON-NEXT:    cmpl $-32768, %edx ## imm = 0x8000
+; ATHLON-NEXT:    cmovll %ecx, %eax
 ; ATHLON-NEXT:    movl $32768, %ecx ## imm = 0x8000
-; ATHLON-NEXT:    cmovgel %edx, %ecx
+; ATHLON-NEXT:    cmpl $-32768, %eax ## imm = 0x8000
+; ATHLON-NEXT:    cmovgel %eax, %ecx
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movw %cx, (%eax)
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: clamp:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    cmpl $32768, %eax # imm = 0x8000
 ; MCU-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; MCU-NEXT:    cmpl $32768, %eax # imm = 0x8000
 ; MCU-NEXT:    jge .LBB27_2
 ; MCU-NEXT:  # %bb.1:
 ; MCU-NEXT:    movl %eax, %ecx
 ; MCU-NEXT:  .LBB27_2:
-; MCU-NEXT:    cmpl $-32768, %ecx # imm = 0x8000
 ; MCU-NEXT:    movl $32768, %eax # imm = 0x8000
+; MCU-NEXT:    cmpl $-32768, %ecx # imm = 0x8000
 ; MCU-NEXT:    jl .LBB27_4
 ; MCU-NEXT:  # %bb.3:
 ; MCU-NEXT:    movl %ecx, %eax
@@ -2062,13 +2058,13 @@ define i64 @PR51612(i64 %x, i64 %y) {
 ;
 ; ATHLON-LABEL: PR51612:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    incl %ecx
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    incl %edx
 ; ATHLON-NEXT:    addl $1, %eax
-; ATHLON-NEXT:    adcl $0, %ecx
-; ATHLON-NEXT:    cmovbl %edx, %eax
+; ATHLON-NEXT:    adcl $0, %edx
+; ATHLON-NEXT:    cmovbl %ecx, %eax
 ; ATHLON-NEXT:    andl 10, %eax
 ; ATHLON-NEXT:    xorl %edx, %edx
 ; ATHLON-NEXT:    retl
@@ -2125,12 +2121,12 @@ define i8 @select_uaddo_common_op0(i8 %a, i8 %b, i8 %c, i1 %cond) {
 ;
 ; ATHLON-LABEL: select_uaddo_common_op0:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; ATHLON-NEXT:    cmovnel %ecx, %edx
-; ATHLON-NEXT:    addb (%edx), %al
+; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; ATHLON-NEXT:    cmovnel %eax, %ecx
+; ATHLON-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    addb (%ecx), %al
 ; ATHLON-NEXT:    retl
 ;
 ; MCU-LABEL: select_uaddo_common_op0:
@@ -2174,9 +2170,9 @@ define i32 @select_uaddo_common_op1(i32 %a, i32 %b, i32 %c, i1 %cond) {
 ;
 ; ATHLON-LABEL: select_uaddo_common_op1:
 ; ATHLON:       ## %bb.0:
-; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; ATHLON-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; ATHLON-NEXT:    cmovnel %eax, %ecx
 ; ATHLON-NEXT:    movl (%ecx), %eax
 ; ATHLON-NEXT:    addl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index a7da07f1ae5df..30157179c58d1 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -347,8 +347,8 @@ define i32 @select_lea_2(i1 zeroext %cond) {
 define i64 @select_lea_3(i1 zeroext %cond) {
 ; X86-LABEL: select_lea_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-2, %eax
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB19_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    movl $-1, %edx
@@ -395,8 +395,8 @@ define i32 @select_lea_5(i1 zeroext %cond) {
 define i64 @select_lea_9(i1 zeroext %cond) {
 ; X86-LABEL: select_lea_9:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-7, %eax
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB21_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    movl $-1, %edx
@@ -608,8 +608,8 @@ define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
 define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) {
 ; X86-LABEL: select_pow2_diff_neg_invert:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-99, %eax
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB30_1
 ; X86-NEXT:  # %bb.2:
 ; X86-NEXT:    movl $-1, %edx
@@ -640,8 +640,8 @@ define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) {
 define i8 @sel_67_neg125(i32 %x) {
 ; X86-LABEL: sel_67_neg125:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $43, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movb $67, %al
+; X86-NEXT:    cmpl $43, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jge .LBB31_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movb $-125, %al
@@ -689,8 +689,8 @@ define i32 @select_C1_C2(i1 %cond) {
 define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
 ; X86-LABEL: select_C1_C2_zeroext:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $421, %eax # imm = 0x1A5
+; X86-NEXT:    cmpb $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB33_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl $42, %eax
@@ -759,15 +759,15 @@ define i64 @select_2_or_inc(i64 %x) {
 ; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    .cfi_offset %esi, -12
 ; X86-NEXT:    .cfi_offset %edi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    xorl $2, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    addl $1, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    xorl $2, %esi
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    je .LBB36_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl %eax, %edx
@@ -798,35 +798,25 @@ define i64 @select_2_or_inc(i64 %x) {
 define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
 ; X86-LABEL: sel_constants_add_constant_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %edi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $1, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl $-15, %edx
-; X86-NEXT:    orl $12, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    xorl $13, %esi
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl $10, %edi
-; X86-NEXT:    xorl $14, %edi
-; X86-NEXT:    andl $11, %ecx
-; X86-NEXT:    xorl $15, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    andl $11, %edx
+; X86-NEXT:    xorl $15, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $10, %edx
+; X86-NEXT:    xorl $14, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $3, %edx
+; X86-NEXT:    xorl $13, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    andl $-15, %ecx
+; X86-NEXT:    orl $12, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: sel_constants_add_constant_vec:
@@ -847,9 +837,9 @@ define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
 define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
 ; X86-LABEL: sel_constants_fmul_constant_vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
 ; X86-NEXT:    fldl {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB38_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    fstp %st(1)
@@ -887,14 +877,13 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
 define i64 @opaque_constant(i1 %cond, i64 %x) {
 ; X86-LABEL: opaque_constant:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl $1, %edx
+; X86-NEXT:    xorl $1, %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    sete %cl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    andl $1, %edx
@@ -902,17 +891,8 @@ define i64 @opaque_constant(i1 %cond, i64 %x) {
 ; X86-NEXT:    andl $1, %edx
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    xorl $1, %esi
-; X86-NEXT:    xorl $1, %ecx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    sete %bl
-; X86-NEXT:    subl %ebx, %eax
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    sbbl $0, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opaque_constant:
@@ -1010,8 +990,8 @@ define i32 @select_ult9_7_6(i32 %X) {
 define i32 @select_ult2_2_3(i32 %X) {
 ; X86-LABEL: select_ult2_2_3:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $2, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $3, %eax
+; X86-NEXT:    cmpl $2, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    retl
 ;
@@ -1029,8 +1009,8 @@ define i32 @select_ult2_2_3(i32 %X) {
 define i32 @select_ugt3_3_2(i32 %X) {
 ; X86-LABEL: select_ugt3_3_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $4, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $2, %eax
+; X86-NEXT:    cmpl $4, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl $-1, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll
index 60ac6df3f77af..816628c0aff73 100644
--- a/llvm/test/CodeGen/X86/setcc.ll
+++ b/llvm/test/CodeGen/X86/setcc.ll
@@ -76,8 +76,8 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
 define i32 @t4(i32 %a) {
 ; X86-LABEL: t4:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl L_v4$non_lazy_ptr, %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl L_v4$non_lazy_ptr, %ecx
 ; X86-NEXT:    cmpl $1, (%ecx)
 ; X86-NEXT:    adcw $1, %ax
 ; X86-NEXT:    shll $16, %eax
@@ -186,7 +186,7 @@ define i64 @t9(i32 %0, i32 %1) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    btl %edx, %ecx
+; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
@@ -216,7 +216,7 @@ define i32 @t10(i32 %0, i32 %1) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    btl %edx, %ecx
+; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -238,7 +238,7 @@ define i32 @t11(i32 %0, i32 %1) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    btl %edx, %ecx
+; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -260,7 +260,7 @@ define i32 @t12(i32 %0, i32 %1) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    btl %edx, %ecx
+; X86-NEXT:    btl %ecx, %edx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/sext-i1.ll b/llvm/test/CodeGen/X86/sext-i1.ll
index 7a906aed1a42e..0c0076fbf2cb2 100644
--- a/llvm/test/CodeGen/X86/sext-i1.ll
+++ b/llvm/test/CodeGen/X86/sext-i1.ll
@@ -46,12 +46,12 @@ define i32 @t2(i32 %x) nounwind readnone ssp {
 define i32 @t3(i32 %x, i64 %y) nounwind readonly {
 ; X86-LABEL: t3:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll
index 9f7ac748c47e1..86e85b3cee2d8 100644
--- a/llvm/test/CodeGen/X86/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll
@@ -12,9 +12,9 @@
 define i32 @reg32_shl_by_negated(i32 %val, i32 %shamt) nounwind {
 ; X86-LABEL: reg32_shl_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    retl
@@ -58,13 +58,13 @@ define i32 @load32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @store32_shl_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind {
 ; X86-LABEL: store32_shl_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store32_shl_by_negated:
@@ -83,9 +83,9 @@ define void @store32_shl_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; X86-LABEL: modify32_shl_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $32, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll %cl, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -106,10 +106,10 @@ define i64 @reg64_shl_by_negated(i64 %val, i64 %shamt) nounwind {
 ; X86-LABEL: reg64_shl_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    shldl %cl, %esi, %edx
@@ -138,11 +138,11 @@ define i64 @load64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86-LABEL: load64_shl_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movb $64, %cl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    movb $64, %cl
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    shldl %cl, %esi, %edx
@@ -173,22 +173,27 @@ define void @store64_shl_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB6_2
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    jne .LBB6_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:  .LBB6_2:
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, 4(%ecx)
+; X86-NEXT:    jne .LBB6_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB6_4:
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -211,21 +216,24 @@ define void @modify64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edi
-; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB7_2
+; X86-NEXT:    jne .LBB7_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:  .LBB7_2:
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    jne .LBB7_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB7_4:
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -250,9 +258,9 @@ define void @modify64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define i32 @reg32_lshr_by_negated(i32 %val, i32 %shamt) nounwind {
 ; X86-LABEL: reg32_lshr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -296,13 +304,13 @@ define i32 @load32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @store32_lshr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind {
 ; X86-LABEL: store32_lshr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store32_lshr_by_negated:
@@ -321,9 +329,9 @@ define void @store32_lshr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; X86-LABEL: modify32_lshr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $32, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %cl, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -344,10 +352,10 @@ define i64 @reg64_lshr_by_negated(i64 %val, i64 %shamt) nounwind {
 ; X86-LABEL: reg64_lshr_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -376,11 +384,11 @@ define i64 @load64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86-LABEL: load64_lshr_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %esi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -409,26 +417,27 @@ define i64 @load64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define void @store64_lshr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind {
 ; X86-LABEL: store64_lshr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB14_2
+; X86-NEXT:    jne .LBB14_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:  .LBB14_2:
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, 4(%ecx)
+; X86-NEXT:    jne .LBB14_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:  .LBB14_4:
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store64_lshr_by_negated:
@@ -449,22 +458,27 @@ define void @modify64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %edi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    movl 4(%eax), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB15_2
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    jne .LBB15_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:  .LBB15_2:
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    jne .LBB15_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:  .LBB15_4:
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -488,9 +502,9 @@ define void @modify64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define i32 @reg32_ashr_by_negated(i32 %val, i32 %shamt) nounwind {
 ; X86-LABEL: reg32_ashr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    retl
@@ -534,13 +548,13 @@ define i32 @load32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @store32_ashr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind {
 ; X86-LABEL: store32_ashr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    sarl %cl, %edx
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    sarl %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store32_ashr_by_negated:
@@ -559,9 +573,9 @@ define void @store32_ashr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; X86-LABEL: modify32_ashr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $32, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl %cl, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -582,10 +596,10 @@ define i64 @reg64_ashr_by_negated(i64 %val, i64 %shamt) nounwind {
 ; X86-LABEL: reg64_ashr_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -615,11 +629,11 @@ define i64 @load64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86-LABEL: load64_ashr_by_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %esi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -649,27 +663,28 @@ define i64 @load64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define void @store64_ashr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind {
 ; X86-LABEL: store64_ashr_by_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sarl %cl, %esi
-; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shrdl %cl, %esi, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sarl %cl, %edx
+; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB22_2
+; X86-NEXT:    jne .LBB22_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:  .LBB22_2:
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, 4(%ecx)
+; X86-NEXT:    jne .LBB22_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:  .LBB22_4:
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store64_ashr_by_negated:
@@ -690,23 +705,27 @@ define void @modify64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %edi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %edi
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    sarl %cl, %esi
-; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    shrdl %cl, %edx, %edi
+; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB23_2
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    jne .LBB23_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:  .LBB23_2:
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    jne .LBB23_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB23_4:
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -734,10 +753,10 @@ define void @modify64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define i32 @reg32_lshr_by_sub_from_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_from_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -762,11 +781,11 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $64, %cl
-; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -799,9 +818,9 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 define i32 @reg32_lshr_by_sub_of_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_of_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -824,11 +843,11 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-LABEL: reg64_lshr_by_sub_of_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addb $-64, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -861,9 +880,9 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_add_to_negated:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -885,11 +904,11 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-LABEL: reg64_lshr_by_add_to_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addb $64, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -922,9 +941,9 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_of_negated_amts:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -947,10 +966,10 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86-LABEL: reg64_lshr_by_sub_of_negated_amts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -984,10 +1003,10 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 define i32 @reg32_lshr_by_add_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_add_of_negated_amts:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1013,11 +1032,11 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $-128, %cl
-; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1051,9 +1070,9 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 define i32 @reg32_lshr_by_negated_unfolded(i32 %val, i32 %shamt) nounwind {
 ; X86-LABEL: reg32_lshr_by_negated_unfolded:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1075,10 +1094,10 @@ define i64 @reg64_lshr_by_negated_unfolded(i64 %val, i64 %shamt) nounwind {
 ; X86-LABEL: reg64_lshr_by_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1108,10 +1127,10 @@ define i64 @reg64_lshr_by_negated_unfolded(i64 %val, i64 %shamt) nounwind {
 define i32 @reg32_lshr_by_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_sub_b:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1137,11 +1156,11 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $64, %cl
-; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1172,9 +1191,9 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1198,11 +1217,11 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 ; X86-LABEL: reg64_lshr_by_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addb $-64, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1232,9 +1251,9 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_add_b:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1257,11 +1276,11 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86-LABEL: reg64_lshr_by_negated_unfolded_add_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addb $64, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1295,9 +1314,9 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 define i32 @reg32_lshr_by_masked_negated_unfolded(i32 %val, i32 %shamt) nounwind {
 ; X86-LABEL: reg32_lshr_by_masked_negated_unfolded:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1319,11 +1338,11 @@ define i64 @reg64_lshr_by_masked_negated_unfolded(i64 %val, i64 %shamt) nounwind
 ; X86-LABEL: reg64_lshr_by_masked_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1353,11 +1372,11 @@ define i64 @reg64_lshr_by_masked_negated_unfolded(i64 %val, i64 %shamt) nounwind
 define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_masked_negated_unfolded_sub_b:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1382,12 +1401,12 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 ; X86-LABEL: reg64_lshr_by_masked_negated_unfolded_sub_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $63, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1420,12 +1439,12 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_masked_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $31, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $31, %edx
-; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1450,13 +1469,13 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 ; X86-LABEL: reg64_lshr_by_masked_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $63, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $63, %edx
-; X86-NEXT:    subl %edx, %ecx
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1489,11 +1508,11 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_masked_negated_unfolded_add_b:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $31, %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1519,12 +1538,12 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 ; X86-LABEL: reg64_lshr_by_masked_negated_unfolded_add_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl $63, %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll
index a467720fbbe61..5b6531ed6854d 100644
--- a/llvm/test/CodeGen/X86/shift-and.ll
+++ b/llvm/test/CodeGen/X86/shift-and.ll
@@ -131,21 +131,26 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%eax), %esi
 ; X86-NEXT:    movl 4(%eax), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    shrdl %cl, %edi, %esi
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    je .LBB5_2
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    jne .LBB5_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:  .LBB5_2:
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    jne .LBB5_4
+; X86-NEXT:  # %bb.3:
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:  .LBB5_4:
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
@@ -168,21 +173,19 @@ define void @t5ptr(i64 %t, ptr %ptr) nounwind {
 define i64 @t6(i64 %key, ptr nocapture %val) nounwind {
 ; X86-LABEL: t6:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $3, %edi, %esi
-; X86-NEXT:    shrl $3, %edi
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edx
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    andl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shrdl $3, %edx, %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    andl %esi, %edx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t6:
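
The two hunks above are representative of most of this patch. In t6 the decrement of the loaded pair (addl $-1 / adcl $-1) is now scheduled before the shifted operands are loaded, so %edi is never live and its push/pop disappears. In t5ptr the second jne is not a stray: it consumes the EFLAGS written by the earlier testb $32, %cl, which survive the intervening movl instructions (plain moves do not write flags), so the 32-vs-64 shift case is decided once but branched on twice, again saving a register. The reordering can be reproduced in isolation on an unpatched build by pinning the pre-RA scheduler from a RUN line; a minimal sketch (list-burr is the bottom-up register-reduction list scheduler; which scheduler the patched logic actually selects depends on the subtarget, so treat the choice below as an illustration only):

; RUN: llc < %s -mtriple=i686-unknown -pre-RA-sched=source    | FileCheck %s --check-prefix=SRC
; RUN: llc < %s -mtriple=i686-unknown -pre-RA-sched=list-burr | FileCheck %s --check-prefix=BURR

; Paraphrase of t6 from shift-and.ll, not copied verbatim; the SRC/BURR
; bodies would be generated with llvm/utils/update_llc_test_checks.py.
define i64 @t6(i64 %key, ptr nocapture %val) nounwind {
  %shr = lshr i64 %key, 3
  %v   = load i64, ptr %val
  %dec = add i64 %v, -1       ; becomes addl $-1 / adcl $-1 on i686
  %and = and i64 %dec, %shr
  ret i64 %and
}
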
diff --git a/llvm/test/CodeGen/X86/shift-bmi2.ll b/llvm/test/CodeGen/X86/shift-bmi2.ll
index bb0213891c976..5b4d18de74987 100644
--- a/llvm/test/CodeGen/X86/shift-bmi2.ll
+++ b/llvm/test/CodeGen/X86/shift-bmi2.ll
@@ -48,9 +48,9 @@ define i32 @shl32i(i32 %x) nounwind uwtable readnone {
 define i32 @shl32p(ptr %p, i32 %shamt) nounwind uwtable readnone {
 ; BMI2-LABEL: shl32p:
 ; BMI2:       # %bb.0:
-; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; BMI2-NEXT:    shlxl %ecx, (%eax), %eax
+; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; BMI2-NEXT:    shlxl %eax, (%ecx), %eax
 ; BMI2-NEXT:    retl
 ;
 ; BMI264-LABEL: shl32p:
@@ -230,9 +230,9 @@ define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
 define i32 @lshr32p(ptr %p, i32 %shamt) nounwind uwtable readnone {
 ; BMI2-LABEL: lshr32p:
 ; BMI2:       # %bb.0:
-; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; BMI2-NEXT:    shrxl %ecx, (%eax), %eax
+; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; BMI2-NEXT:    retl
 ;
 ; BMI264-LABEL: lshr32p:
@@ -338,9 +338,9 @@ define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
 define i32 @ashr32p(ptr %p, i32 %shamt) nounwind uwtable readnone {
 ; BMI2-LABEL: ashr32p:
 ; BMI2:       # %bb.0:
-; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; BMI2-NEXT:    sarxl %ecx, (%eax), %eax
+; BMI2-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; BMI2-NEXT:    sarxl %eax, (%ecx), %eax
 ; BMI2-NEXT:    retl
 ;
 ; BMI264-LABEL: ashr32p:
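
The shift-bmi2.ll hunks are purely cosmetic. shlxl/shrxl/sarxl take the shift amount in an arbitrary GPR rather than the fixed %cl, so the two incoming stack loads have no dependence between them and either order is legal; only the register assignment moves around. For reference, the function under test has roughly this shape (paraphrased, not copied from the test file):

define i32 @shl32p(ptr %p, i32 %shamt) nounwind {
  %x   = load i32, ptr %p
  %shl = shl i32 %x, %shamt   ; folds to shlxl with -mattr=+bmi2
  ret i32 %shl
}
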
diff --git a/llvm/test/CodeGen/X86/shift-by-signext.ll b/llvm/test/CodeGen/X86/shift-by-signext.ll
index 97a4318c0720f..4fe9aea6f83ad 100644
--- a/llvm/test/CodeGen/X86/shift-by-signext.ll
+++ b/llvm/test/CodeGen/X86/shift-by-signext.ll
@@ -88,9 +88,9 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; X86-LABEL: n6_fshl:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    retl
 ;
@@ -108,9 +108,9 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; X86-LABEL: n7_fshr:
 ; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrdl %cl, %edx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/shift-coalesce.ll b/llvm/test/CodeGen/X86/shift-coalesce.ll
index e01f56d643df0..28f1055f9b9dd 100644
--- a/llvm/test/CodeGen/X86/shift-coalesce.ll
+++ b/llvm/test/CodeGen/X86/shift-coalesce.ll
@@ -9,10 +9,10 @@ define i64 @foo(i64 %x, ptr %X) {
 ; CHECK-NEXT:    push esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    .cfi_offset esi, -8
-; CHECK-NEXT:    mov esi, dword ptr [esp + 8]
-; CHECK-NEXT:    mov edx, dword ptr [esp + 12]
 ; CHECK-NEXT:    mov eax, dword ptr [esp + 16]
 ; CHECK-NEXT:    movzx ecx, byte ptr [eax]
+; CHECK-NEXT:    mov esi, dword ptr [esp + 8]
+; CHECK-NEXT:    mov edx, dword ptr [esp + 12]
 ; CHECK-NEXT:    mov eax, esi
 ; CHECK-NEXT:    shl eax, cl
 ; CHECK-NEXT:    shld edx, esi, cl
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index dfeef48897e06..e99526e552afb 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -200,27 +200,23 @@ define i64 @ashr_add_shl_i8(i64 %r) nounwind {
 define <4 x i32> @ashr_add_shl_v4i8(<4 x i32> %r) nounwind {
 ; X86-LABEL: ashr_add_shl_v4i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    incb %al
+; X86-NEXT:    movsbl %al, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    incb %dh
-; X86-NEXT:    movsbl %dh, %esi
-; X86-NEXT:    incb %ch
-; X86-NEXT:    movsbl %ch, %edi
-; X86-NEXT:    incb %dl
-; X86-NEXT:    movsbl %dl, %edx
 ; X86-NEXT:    incb %cl
 ; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    incb %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    incb %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: ashr_add_shl_v4i8:
@@ -306,10 +302,10 @@ define i64 @ashr_add_shl_mismatch_shifts2(i64 %r) nounwind {
 define dso_local i32 @ashr_add_shl_i32_i8_extra_use1(i32 %r, ptr %p) nounwind {
 ; X86-LABEL: ashr_add_shl_i32_i8_extra_use1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $24, %eax
 ; X86-NEXT:    addl $33554432, %eax # imm = 0x2000000
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    sarl $24, %eax
 ; X86-NEXT:    retl
@@ -332,9 +328,9 @@ define dso_local i32 @ashr_add_shl_i32_i8_extra_use1(i32 %r, ptr %p) nounwind {
 define dso_local i32 @ashr_add_shl_i32_i8_extra_use2(i32 %r, ptr %p) nounwind {
 ; X86-LABEL: ashr_add_shl_i32_i8_extra_use2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $24, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    addl $33554432, %eax # imm = 0x2000000
 ; X86-NEXT:    sarl $24, %eax
@@ -358,12 +354,12 @@ define dso_local i32 @ashr_add_shl_i32_i8_extra_use2(i32 %r, ptr %p) nounwind {
 define dso_local i32 @ashr_add_shl_i32_i8_extra_use3(i32 %r, ptr %p1, ptr %p2) nounwind {
 ; X86-LABEL: ashr_add_shl_i32_i8_extra_use3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $24, %eax
-; X86-NEXT:    movl %eax, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    addl $33554432, %eax # imm = 0x2000000
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    sarl $24, %eax
 ; X86-NEXT:    retl
@@ -465,27 +461,23 @@ define i64 @ashr_add_neg_shl_i8(i64 %r) nounwind {
 define <4 x i32> @ashr_add_neg_shl_v4i8(<4 x i32> %r) nounwind {
 ; X86-LABEL: ashr_add_neg_shl_v4i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movb $1, %dl
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    movb $1, %ch
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movsbl %ch, %esi
-; X86-NEXT:    movb $1, %ch
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movsbl %ch, %edi
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movb $1, %dl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: ashr_add_neg_shl_v4i8:
@@ -557,9 +549,9 @@ define i32 @xor_tree_with_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 define i32 @and_tree_with_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; X86-LABEL: and_tree_with_shifts_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    retl
@@ -583,9 +575,9 @@ define i32 @and_tree_with_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 define i32 @logic_tree_with_shifts_var_i32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %s) {
 ; X86-LABEL: logic_tree_with_shifts_var_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
@@ -611,11 +603,11 @@ define i32 @logic_tree_with_shifts_var_i32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %
 define i32 @logic_tree_with_mismatching_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; X86-LABEL: logic_tree_with_mismatching_shifts_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $15, %ecx
-; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $15, %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
@@ -640,10 +632,10 @@ define i32 @logic_tree_with_mismatching_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %
 define i32 @logic_tree_with_mismatching_shifts2_i32(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; X86-LABEL: logic_tree_with_mismatching_shifts2_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $16, %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    retl
@@ -668,41 +660,31 @@ define i32 @logic_tree_with_mismatching_shifts2_i32(i32 %a, i32 %b, i32 %c, i32
 define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; X86-LABEL: or_tree_with_shifts_vec_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shll $16, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shll $16, %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shll $16, %edi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: or_tree_with_shifts_vec_i32:
@@ -723,49 +705,39 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i
 define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; X86-LABEL: or_tree_with_mismatching_shifts_vec_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    .cfi_offset %esi, -12
-; X86-NEXT:    .cfi_offset %edi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    shll $17, %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    shll $17, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    shll $17, %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    shll $17, %edi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $17, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $16, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: or_tree_with_mismatching_shifts_vec_i32:
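
The <4 x i32> tests in shift-combine.ll show the register-pressure effect most clearly. Without vector registers, i686 returns the vector through a hidden sret pointer (hence retl $4), and the old schedule computed all four lanes before storing any of them, keeping four values live at once across %ecx/%edx/%esi/%edi. The new schedule finishes and stores one lane before starting the next, so a single scratch register suffices and the push/pop pairs together with their .cfi bookkeeping vanish. The IR is unchanged; both schedules implement the same function, roughly (paraphrased):

define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b,
                                              <4 x i32> %c, <4 x i32> %d) {
  %ab  = or <4 x i32> %a, %b
  %sh  = shl <4 x i32> %ab, <i32 16, i32 16, i32 16, i32 16>
  %cd  = or <4 x i32> %c, %d
  %res = or <4 x i32> %sh, %cd
  ret <4 x i32> %res
}
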
diff --git a/llvm/test/CodeGen/X86/shift-double.ll b/llvm/test/CodeGen/X86/shift-double.ll
index 5a2028216033c..22e8cda0a92ec 100644
--- a/llvm/test/CodeGen/X86/shift-double.ll
+++ b/llvm/test/CodeGen/X86/shift-double.ll
@@ -128,10 +128,10 @@ define i32 @test4(i32 %A, i32 %B, i8 %C) nounwind {
 define i16 @test5(i16 %A, i16 %B, i8 %C) nounwind {
 ; X86-LABEL: test5:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shldw %cl, %dx, %ax
 ; X86-NEXT:    retl
 ;
@@ -183,10 +183,10 @@ define i32 @test6(i32 %A, i32 %B, i8 %C) nounwind {
 define i16 @test7(i16 %A, i16 %B, i8 %C) nounwind {
 ; X86-LABEL: test7:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrdw %cl, %dx, %ax
 ; X86-NEXT:    retl
 ;
@@ -487,11 +487,11 @@ define i32 @test18(i32 %hi, i32 %lo, i32 %bits) nounwind {
 define i32 @not_shld_i32(i32, i32, i32) {
 ; X86-LABEL: not_shld_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    retl
@@ -518,11 +518,11 @@ define i32 @not_shld_i32(i32, i32, i32) {
 define i32 @not_shrd_i32(i32, i32, i32) {
 ; X86-LABEL: not_shrd_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    negb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/shift-folding.ll b/llvm/test/CodeGen/X86/shift-folding.ll
index c4be7d990cbaa..76a8a6320b8f6 100644
--- a/llvm/test/CodeGen/X86/shift-folding.ll
+++ b/llvm/test/CodeGen/X86/shift-folding.ll
@@ -55,9 +55,9 @@ define fastcc i32 @test4(ptr %d) {
 define i64 @test5(i16 %i, ptr %arr) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    shrl $11, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    addl (%ecx,%eax,4), %eax
 ; CHECK-NEXT:    setb %dl
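
The shift-i128.ll changes below are where the rescheduling pays off most: each 32-bit piece of the result is stored as soon as it is computed, so the v2i128 functions now run entirely out of registers, every '# 4-byte Spill' / '# 4-byte Reload' in them disappears, and the frames shrink (subl $112 -> subl $80 for the lshr/ashr cases, subl $128 -> subl $64 for shl). If we want a guard against the spills coming back, something along these lines could be added later (illustrative only; the triple and prefixes would have to match the file's existing RUN lines):

; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --implicit-check-not="4-byte Spill"
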
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 049ee47af9681..ce287187903b8 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -16,38 +16,40 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
 ; i686-NEXT:    subl $48, %esp
-; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 8(%ebp), %eax
-; i686-NEXT:    movl 12(%ebp), %edx
-; i686-NEXT:    movl 16(%ebp), %esi
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, (%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 24(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, %eax
 ; i686-NEXT:    shrb $3, %al
 ; i686-NEXT:    andb $12, %al
-; i686-NEXT:    movzbl %al, %edi
-; i686-NEXT:    movl 8(%esp,%edi), %eax
-; i686-NEXT:    movl 4(%esp,%edi), %ebx
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    shrdl %cl, %eax, %edx
-; i686-NEXT:    movl (%esp,%edi), %esi
-; i686-NEXT:    movl 12(%esp,%edi), %edi
-; i686-NEXT:    shrdl %cl, %edi, %eax
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    movl 40(%ebp), %ebx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movzbl %al, %edx
+; i686-NEXT:    movl 12(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
 ; i686-NEXT:    shrl %cl, %edi
-; i686-NEXT:    movl %edi, 12(%ebx)
-; i686-NEXT:    movl %eax, 8(%ebx)
-; i686-NEXT:    movl %edx, 4(%ebx)
-; i686-NEXT:    movl %esi, (%ebx)
+; i686-NEXT:    movl 40(%ebp), %eax
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl 8(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl (%esp,%edx), %esi
+; i686-NEXT:    movl 4(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 4(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, (%eax)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -83,39 +85,41 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
 ; i686-NEXT:    subl $48, %esp
-; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 8(%ebp), %eax
 ; i686-NEXT:    movl 12(%ebp), %edx
-; i686-NEXT:    movl 16(%ebp), %esi
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, (%esp)
-; i686-NEXT:    sarl $31, %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 24(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, %eax
 ; i686-NEXT:    shrb $3, %al
 ; i686-NEXT:    andb $12, %al
-; i686-NEXT:    movzbl %al, %edi
-; i686-NEXT:    movl 8(%esp,%edi), %eax
-; i686-NEXT:    movl 4(%esp,%edi), %ebx
-; i686-NEXT:    movl %ebx, %edx
-; i686-NEXT:    shrdl %cl, %eax, %edx
-; i686-NEXT:    movl (%esp,%edi), %esi
-; i686-NEXT:    movl 12(%esp,%edi), %edi
-; i686-NEXT:    shrdl %cl, %edi, %eax
-; i686-NEXT:    shrdl %cl, %ebx, %esi
-; i686-NEXT:    movl 40(%ebp), %ebx
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movzbl %al, %edx
+; i686-NEXT:    movl 12(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
 ; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    movl %edi, 12(%ebx)
-; i686-NEXT:    movl %eax, 8(%ebx)
-; i686-NEXT:    movl %edx, 4(%ebx)
-; i686-NEXT:    movl %esi, (%ebx)
+; i686-NEXT:    movl 40(%ebp), %eax
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl 8(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl (%esp,%edx), %esi
+; i686-NEXT:    movl 4(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 4(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, (%eax)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -147,49 +151,45 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    movl %esp, %ebp
-; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $48, %esp
-; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl 16(%ebp), %eax
+; i686-NEXT:    movl 20(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 8(%ebp), %eax
-; i686-NEXT:    movl 12(%ebp), %edx
-; i686-NEXT:    movl 16(%ebp), %esi
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl 24(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, %eax
 ; i686-NEXT:    shrb $3, %al
 ; i686-NEXT:    andb $12, %al
 ; i686-NEXT:    negb %al
-; i686-NEXT:    movsbl %al, %edi
-; i686-NEXT:    movl 20(%esp,%edi), %eax
-; i686-NEXT:    movl 24(%esp,%edi), %ebx
-; i686-NEXT:    movl %ebx, %esi
-; i686-NEXT:    shldl %cl, %eax, %esi
-; i686-NEXT:    movl 16(%esp,%edi), %edx
-; i686-NEXT:    movl 28(%esp,%edi), %edi
-; i686-NEXT:    shldl %cl, %ebx, %edi
-; i686-NEXT:    movl 40(%ebp), %ebx
-; i686-NEXT:    movl %edi, 12(%ebx)
-; i686-NEXT:    movl %esi, 8(%ebx)
-; i686-NEXT:    movl %edx, %esi
-; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    movsbl %al, %esi
+; i686-NEXT:    movl 24(%esp,%esi), %edi
+; i686-NEXT:    movl 28(%esp,%esi), %edx
+; i686-NEXT:    shldl %cl, %edi, %edx
+; i686-NEXT:    movl 40(%ebp), %eax
+; i686-NEXT:    movl %edx, 12(%eax)
+; i686-NEXT:    movl 16(%esp,%esi), %edx
+; i686-NEXT:    movl 20(%esp,%esi), %esi
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, 8(%eax)
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 4(%eax)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edx, %eax
-; i686-NEXT:    movl %eax, 4(%ebx)
-; i686-NEXT:    movl %esi, (%ebx)
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    movl %edx, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl
 ;
@@ -281,87 +281,74 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $112, %esp
-; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    subl $80, %esp
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 24(%ebp), %eax
 ; i686-NEXT:    movl 28(%ebp), %ecx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 16(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 12(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    andl $31, %ebx
+; i686-NEXT:    movl 56(%ebp), %edx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 40(%esp,%edx), %eax
-; i686-NEXT:    movl 36(%esp,%edx), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %eax, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 44(%esp,%edx), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl %ebx, %esi
-; i686-NEXT:    shrdl %cl, %edx, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 44(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    shrl %cl, %edi
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edi, 28(%eax)
+; i686-NEXT:    movl 40(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 24(%eax)
+; i686-NEXT:    movl 32(%esp,%edx), %esi
+; i686-NEXT:    movl 36(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 20(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 16(%eax)
+; i686-NEXT:    movl 16(%ebp), %ecx
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %ecx
+; i686-NEXT:    movl 12(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, (%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 56(%ebp), %edx
-; i686-NEXT:    movl %edx, %eax
-; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 72(%esp,%edx), %ebx
-; i686-NEXT:    movl 68(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 64(%esp,%edx), %edi
-; i686-NEXT:    movl 76(%esp,%edx), %edx
-; i686-NEXT:    shrdl %cl, %edx, %ebx
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl 12(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    shrl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl 8(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl (%esp,%edx), %esi
+; i686-NEXT:    movl 4(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 4(%eax)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    shrl %cl, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl %ebx, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %edi, 16(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, (%eax)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -408,90 +395,76 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $112, %esp
-; i686-NEXT:    movl 40(%ebp), %edx
-; i686-NEXT:    movl 24(%ebp), %eax
-; i686-NEXT:    movl 28(%ebp), %ecx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 16(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 12(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    subl $80, %esp
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, %eax
-; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 56(%ebp), %edx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 40(%esp,%edx), %esi
-; i686-NEXT:    movl 36(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 44(%esp,%edx), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 56(%ebp), %edx
+; i686-NEXT:    movl 44(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edi, 28(%eax)
+; i686-NEXT:    movl 40(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 24(%eax)
+; i686-NEXT:    movl 32(%esp,%edx), %esi
+; i686-NEXT:    movl 36(%esp,%edx), %edx
 ; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    andl $31, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 20(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 16(%eax)
+; i686-NEXT:    movl 16(%ebp), %ecx
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %ecx
+; i686-NEXT:    movl 12(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, (%esp)
+; i686-NEXT:    sarl $31, %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 40(%ebp), %edx
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 72(%esp,%edx), %esi
-; i686-NEXT:    movl 68(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 64(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 76(%esp,%edx), %edx
-; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    movl 12(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl 8(%esp,%edx), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %ebx
+; i686-NEXT:    movl %ebx, 8(%eax)
+; i686-NEXT:    movl (%esp,%edx), %esi
+; i686-NEXT:    movl 4(%esp,%edx), %edx
+; i686-NEXT:    movl %edx, %ebx
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, 4(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    sarl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, %edi
-; i686-NEXT:    sarl %cl, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl %esi, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %edi, 16(%eax)
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    movl %esi, (%eax)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -537,107 +510,80 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686:       # %bb.0: # %entry
 ; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    movl %esp, %ebp
-; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $128, %esp
-; i686-NEXT:    movl 40(%ebp), %edi
+; i686-NEXT:    subl $64, %esp
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 24(%ebp), %eax
 ; i686-NEXT:    movl 28(%ebp), %ecx
-; i686-NEXT:    movl 32(%ebp), %edx
-; i686-NEXT:    movl 20(%ebp), %esi
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 16(%ebp), %esi
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 12(%ebp), %esi
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%ebp), %esi
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 36(%ebp), %esi
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ebx
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $12, %ebx
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    subl %ebx, %eax
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrl $3, %eax
+; i686-NEXT:    andl $12, %eax
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    subl %eax, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%eax), %esi
-; i686-NEXT:    movl 4(%eax), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%eax), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    movl 8(%edx), %esi
 ; i686-NEXT:    andl $31, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    negl %eax
+; i686-NEXT:    movl 60(%esp,%eax), %edi
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl 72(%ebp), %eax
+; i686-NEXT:    movl %edi, 28(%eax)
+; i686-NEXT:    movl (%edx), %edi
+; i686-NEXT:    movl 4(%edx), %edx
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    shldl %cl, %edi, %edx
+; i686-NEXT:    movl %edx, 20(%eax)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edx, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 56(%ebp), %eax
-; i686-NEXT:    movl %eax, %edx
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    subl %edx, %ecx
-; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    movl 16(%ebp), %ecx
+; i686-NEXT:    movl 20(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %ecx
+; i686-NEXT:    movl 12(%ebp), %edx
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 40(%ebp), %ecx
+; i686-NEXT:    movl %ecx, %esi
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $12, %esi
+; i686-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    subl %esi, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%ecx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 4(%ecx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%ecx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    andl $31, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl 8(%edx), %edi
+; i686-NEXT:    andl $31, %ecx
+; i686-NEXT:    negl %esi
+; i686-NEXT:    movl 28(%esp,%esi), %esi
+; i686-NEXT:    shldl %cl, %edi, %esi
+; i686-NEXT:    movl %esi, 12(%eax)
+; i686-NEXT:    movl (%edx), %esi
+; i686-NEXT:    movl 4(%edx), %edx
+; i686-NEXT:    shldl %cl, %edx, %edi
+; i686-NEXT:    movl %edi, 8(%eax)
+; i686-NEXT:    shldl %cl, %esi, %edx
+; i686-NEXT:    movl %edx, 4(%eax)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edi, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    negl %ebx
-; i686-NEXT:    movl 76(%esp,%ebx), %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shldl %cl, %esi, %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    negl %edx
-; i686-NEXT:    movl 108(%esp,%edx), %edx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shldl %cl, %eax, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %esi, 16(%eax)
-; i686-NEXT:    movl %ebx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl
 ;
@@ -905,24 +851,19 @@ define i128 @lshr_shl_mask(i128 %a0) {
 ; i686-NEXT:    .cfi_offset %ebp, -8
 ; i686-NEXT:    movl %esp, %ebp
 ; i686-NEXT:    .cfi_def_cfa_register %ebp
-; i686-NEXT:    pushl %edi
-; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    .cfi_offset %esi, -16
-; i686-NEXT:    .cfi_offset %edi, -12
+; i686-NEXT:    subl $16, %esp
+; i686-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; i686-NEXT:    andl 36(%ebp), %ecx
 ; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl 32(%ebp), %ecx
+; i686-NEXT:    movl %ecx, 8(%eax)
 ; i686-NEXT:    movl 24(%ebp), %ecx
 ; i686-NEXT:    movl 28(%ebp), %edx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
-; i686-NEXT:    andl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, 12(%eax)
-; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl %edx, 4(%eax)
 ; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    leal -8(%ebp), %esp
-; i686-NEXT:    popl %esi
-; i686-NEXT:    popl %edi
+; i686-NEXT:    movl %ebp, %esp
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    .cfi_def_cfa %esp, 4
 ; i686-NEXT:    retl $4
@@ -948,33 +889,30 @@ define i128 @shift_i128_limited_shamt(i128 noundef %a, i32 noundef %b) nounwind
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
 ; i686-NEXT:    subl $16, %esp
-; i686-NEXT:    movl 32(%ebp), %ebx
-; i686-NEXT:    movl 28(%ebp), %edi
-; i686-NEXT:    movzbl 40(%ebp), %ecx
+; i686-NEXT:    movb 40(%ebp), %ch
 ; i686-NEXT:    movb $6, %dl
-; i686-NEXT:    subb %cl, %dl
-; i686-NEXT:    addb $-7, %cl
-; i686-NEXT:    movl %edi, %eax
-; i686-NEXT:    shrl %eax
-; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    subb %ch, %dl
+; i686-NEXT:    movl 32(%ebp), %esi
+; i686-NEXT:    movl 36(%ebp), %edi
+; i686-NEXT:    movb %dl, %cl
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    addb $-7, %ch
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movl %edi, %ebx
+; i686-NEXT:    shrl %ebx
+; i686-NEXT:    movb %ch, %cl
+; i686-NEXT:    shrl %cl, %ebx
 ; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    orl %eax, %ebx
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    orl %ebx, %esi
+; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl 24(%ebp), %esi
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    shll %cl, %eax
 ; i686-NEXT:    shldl %cl, %esi, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%ebp), %edi
-; i686-NEXT:    movl 36(%ebp), %esi
-; i686-NEXT:    movl 32(%ebp), %edx
-; i686-NEXT:    shldl %cl, %edx, %esi
-; i686-NEXT:    movl %esi, 12(%edi)
-; i686-NEXT:    movl %ebx, 8(%edi)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%edi)
-; i686-NEXT:    movl %eax, (%edi)
-; i686-NEXT:    movl %edi, %eax
+; i686-NEXT:    movl %edi, 4(%eax)
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    movl %esi, (%eax)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
@@ -1003,51 +941,45 @@ define i128 @shift_i128_limited_shamt_no_nuw(i128 noundef %a, i32 noundef %b) no
 ; i686:       # %bb.0: # %start
 ; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    movl %esp, %ebp
-; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $48, %esp
-; i686-NEXT:    movzbl 40(%ebp), %eax
-; i686-NEXT:    movl 24(%ebp), %ecx
-; i686-NEXT:    movl 28(%ebp), %edx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movb $6, %cl
-; i686-NEXT:    subb %al, %cl
+; i686-NEXT:    subb 40(%ebp), %cl
 ; i686-NEXT:    movl %ecx, %eax
 ; i686-NEXT:    shrb $3, %al
 ; i686-NEXT:    andb $12, %al
 ; i686-NEXT:    negb %al
-; i686-NEXT:    movsbl %al, %eax
+; i686-NEXT:    movsbl %al, %edx
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, (%esp)
-; i686-NEXT:    movl 20(%esp,%eax), %edx
-; i686-NEXT:    movl 24(%esp,%eax), %ebx
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:    shldl %cl, %edx, %edi
-; i686-NEXT:    movl 16(%esp,%eax), %esi
-; i686-NEXT:    movl 28(%esp,%eax), %eax
-; i686-NEXT:    shldl %cl, %ebx, %eax
-; i686-NEXT:    movl 8(%ebp), %ebx
-; i686-NEXT:    movl %eax, 12(%ebx)
-; i686-NEXT:    movl %edi, 8(%ebx)
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    shldl %cl, %esi, %edx
-; i686-NEXT:    movl %edx, 4(%ebx)
-; i686-NEXT:    movl %eax, (%ebx)
-; i686-NEXT:    movl %ebx, %eax
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    movl 24(%esp,%edx), %esi
+; i686-NEXT:    movl 28(%esp,%edx), %edi
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl %edi, 12(%eax)
+; i686-NEXT:    movl 16(%esp,%edx), %edi
+; i686-NEXT:    movl 20(%esp,%edx), %edx
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 8(%eax)
+; i686-NEXT:    shldl %cl, %edi, %edx
+; i686-NEXT:    movl %edx, 4(%eax)
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    movl %edi, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl $4
 ;
@@ -1075,51 +1007,46 @@ define i128 @shift_i128_limited_shamt_unknown_lhs(i128 noundef %a, i32 noundef %
 ; i686:       # %bb.0: # %start
 ; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    movl %esp, %ebp
-; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    subl $32, %esp
+; i686-NEXT:    movl 32(%ebp), %eax
+; i686-NEXT:    movl 36(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 24(%ebp), %eax
-; i686-NEXT:    movl 28(%ebp), %edx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl 44(%ebp), %ecx
-; i686-NEXT:    subl 40(%ebp), %ecx
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 28(%ebp), %ecx
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl 44(%ebp), %ecx
+; i686-NEXT:    subl 40(%ebp), %ecx
 ; i686-NEXT:    movl %ecx, %eax
 ; i686-NEXT:    shrb $3, %al
 ; i686-NEXT:    andb $12, %al
 ; i686-NEXT:    negb %al
-; i686-NEXT:    movsbl %al, %eax
-; i686-NEXT:    movl 20(%esp,%eax), %edx
-; i686-NEXT:    movl 24(%esp,%eax), %ebx
-; i686-NEXT:    movl %ebx, %edi
-; i686-NEXT:    shldl %cl, %edx, %edi
-; i686-NEXT:    movl 16(%esp,%eax), %esi
-; i686-NEXT:    movl 28(%esp,%eax), %eax
-; i686-NEXT:    shldl %cl, %ebx, %eax
-; i686-NEXT:    movl 8(%ebp), %ebx
-; i686-NEXT:    movl %eax, 12(%ebx)
-; i686-NEXT:    movl %edi, 8(%ebx)
-; i686-NEXT:    movl %esi, %eax
-; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    movsbl %al, %edi
+; i686-NEXT:    movl 24(%esp,%edi), %edx
+; i686-NEXT:    movl 28(%esp,%edi), %esi
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl %esi, 12(%eax)
+; i686-NEXT:    movl 16(%esp,%edi), %esi
+; i686-NEXT:    movl 20(%esp,%edi), %edi
+; i686-NEXT:    shldl %cl, %edi, %edx
+; i686-NEXT:    movl %edx, 8(%eax)
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, 4(%eax)
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %esi, %edx
-; i686-NEXT:    movl %edx, 4(%ebx)
-; i686-NEXT:    movl %eax, (%ebx)
-; i686-NEXT:    movl %ebx, %eax
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    movl %esi, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    popl %ebx
 ; i686-NEXT:    popl %ebp
 ; i686-NEXT:    retl $4
 ;
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 3b8e766ae1bf4..13fd725151399 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -171,23 +171,22 @@ define i256 @shl_i256(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -196,54 +195,39 @@ define i256 @shl_i256(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 44(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $28, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 56(%esp,%edx), %esi
+; X86-NEXT:    movl 60(%esp,%edx), %edi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 52(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl 48(%esp,%edx), %esi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 84(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl 44(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl 32(%esp,%edx), %esi
+; X86-NEXT:    movl 36(%esp,%edx), %edx
 ; X86-NEXT:    shldl %cl, %edx, %ebx
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl 64(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -407,24 +391,23 @@ define i256 @lshr_i256(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -433,45 +416,43 @@ define i256 @lshr_i256(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 44(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -646,72 +627,69 @@ define i256 @ashr_i256(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 44(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    sarl %cl, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -881,30 +859,25 @@ define i256 @shl_i256_load(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ecx), %esi
-; X86-NEXT:    movl 12(%ecx), %edi
-; X86-NEXT:    movl 16(%ecx), %ebx
+; X86-NEXT:    movl 28(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 20(%ecx), %edx
-; X86-NEXT:    movl 24(%ecx), %eax
-; X86-NEXT:    movl 28(%ecx), %ecx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -912,53 +885,38 @@ define i256 @shl_i256_load(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $28, %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $3, %dl
+; X86-NEXT:    andb $28, %dl
+; X86-NEXT:    negb %dl
+; X86-NEXT:    movsbl %dl, %edx
+; X86-NEXT:    movl 56(%esp,%edx), %esi
+; X86-NEXT:    movl 60(%esp,%edx), %edi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 52(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl 48(%esp,%edx), %esi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 84(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl 44(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl 32(%esp,%edx), %esi
+; X86-NEXT:    movl 36(%esp,%edx), %edx
 ; X86-NEXT:    shldl %cl, %edx, %ebx
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl 64(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1116,29 +1074,25 @@ define i256 @lshr_i256_load(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ecx), %esi
-; X86-NEXT:    movl 12(%ecx), %edi
-; X86-NEXT:    movl 16(%ecx), %ebx
+; X86-NEXT:    movl 28(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 20(%ecx), %edx
-; X86-NEXT:    movl 24(%ecx), %eax
-; X86-NEXT:    movl 28(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -1147,45 +1101,42 @@ define i256 @lshr_i256_load(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $5, %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1372,77 +1323,70 @@ define i256 @ashr_i256_load(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %edi
-; X86-NEXT:    movl 12(%eax), %esi
-; X86-NEXT:    movl 16(%eax), %ebx
-; X86-NEXT:    movl 20(%eax), %edx
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl 28(%eax), %eax
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 24(%ecx), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl 20(%ecx), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl 16(%ecx), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
+; X86-NEXT:    movl 12(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ecx), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    movl 28(%ecx), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrb $5, %dl
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    sarl %cl, %edx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1470,31 +1414,33 @@ define i256 @shl_i256_1(i256 %a0) nounwind {
 ;
 ; X86-LABEL: shl_i256_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $1, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, 20(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    movl %edx, 20(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $1, %edx, %ecx
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl %edx, 16(%eax)
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $1, %esi, %ecx
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    shldl $1, %edx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
   %r = shl i256 %a0, 1
   ret i256 %r
@@ -1516,49 +1462,34 @@ define i256 @lshr_i256_1(i256 %a0) nounwind {
 ;
 ; X86-LABEL: lshr_i256_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shldl $31, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $1, %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $31, %eax, %ebp
-; X86-NEXT:    shrdl $1, %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shldl $31, %eax, %esi
-; X86-NEXT:    shrdl $1, %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrdl $1, %eax, %ebx
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    shldl $31, %ecx, %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    shldl $31, %edx, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %ebx, 24(%ecx)
-; X86-NEXT:    movl %esi, 20(%ecx)
-; X86-NEXT:    movl %edx, 16(%ecx)
-; X86-NEXT:    movl %ebp, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    shldl $31, %ecx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    shrdl $1, %esi, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = lshr i256 %a0, 1
   ret i256 %r
@@ -1580,49 +1511,34 @@ define i256 @ashr_i256_1(i256 %a0) nounwind {
 ;
 ; X86-LABEL: ashr_i256_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shldl $31, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $1, %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shldl $31, %eax, %ebp
-; X86-NEXT:    shrdl $1, %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shldl $31, %eax, %esi
-; X86-NEXT:    shrdl $1, %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sarl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrdl $1, %eax, %ebx
-; X86-NEXT:    sarl %eax
+; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    shldl $31, %ecx, %edx
+; X86-NEXT:    movl %edx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    shldl $31, %edx, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %ebx, 24(%ecx)
-; X86-NEXT:    movl %esi, 20(%ecx)
-; X86-NEXT:    movl %edx, 16(%ecx)
-; X86-NEXT:    movl %ebp, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    shldl $31, %ecx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    shrdl $1, %esi, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = ashr i256 %a0, 1
   ret i256 %r
@@ -1664,8 +1580,8 @@ define i256 @shl_i256_200(i256 %a0) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    movl %edx, 28(%eax)
+; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl $0, 20(%eax)
 ; X86-NEXT:    movl $0, 16(%eax)
@@ -1711,10 +1627,10 @@ define i256 @lshr_i256_200(i256 %a0) nounwind {
 ;
 ; X86-LABEL: lshr_i256_200:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shrl $8, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 28(%eax)
@@ -1744,19 +1660,19 @@ define i256 @ashr_i256_200(i256 %a0) nounwind {
 ; X86-LABEL: ashr_i256_200:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sarl $8, %esi
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %edx, 20(%eax)
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl %esi, 20(%eax)
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    sarl $8, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl $4
@@ -1797,9 +1713,9 @@ define i256 @shl_i256_255(i256 %a0) nounwind {
 ;
 ; X86-LABEL: shl_i256_255:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl $0, 24(%eax)
 ; X86-NEXT:    movl $0, 20(%eax)
@@ -1846,9 +1762,9 @@ define i256 @lshr_i256_255(i256 %a0) nounwind {
 ;
 ; X86-LABEL: lshr_i256_255:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 28(%eax)
 ; X86-NEXT:    movl $0, 24(%eax)
@@ -1875,9 +1791,9 @@ define i256 @ashr_i256_255(i256 %a0) nounwind {
 ;
 ; X86-LABEL: ashr_i256_255:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl %ecx, 20(%eax)
@@ -2047,8 +1963,7 @@ define i256 @shl_1_i256(i256 %a0) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2064,54 +1979,39 @@ define i256 @shl_1_i256(i256 %a0) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $28, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 56(%esp,%edx), %esi
+; X86-NEXT:    movl 60(%esp,%edx), %edi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 52(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl 48(%esp,%edx), %esi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 84(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl 44(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl 32(%esp,%edx), %esi
+; X86-NEXT:    movl 36(%esp,%edx), %edx
 ; X86-NEXT:    shldl %cl, %edx, %ebx
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl 64(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2266,8 +2166,7 @@ define i256 @lshr_signbit_i256(i256 %a0) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2283,46 +2182,44 @@ define i256 @lshr_signbit_i256(i256 %a0) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2480,8 +2377,7 @@ define i256 @ashr_signbit_i256(i256 %a0) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
@@ -2497,46 +2393,44 @@ define i256 @ashr_signbit_i256(i256 %a0) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    sarl %cl, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2705,8 +2599,7 @@ define i256 @shl_allbits_i256(i256 %a0) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
@@ -2722,54 +2615,39 @@ define i256 @shl_allbits_i256(i256 %a0) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $28, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movsbl %al, %edx
+; X86-NEXT:    movl 56(%esp,%edx), %esi
+; X86-NEXT:    movl 60(%esp,%edx), %edi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 52(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 24(%eax)
+; X86-NEXT:    movl 48(%esp,%edx), %esi
 ; X86-NEXT:    shldl %cl, %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 84(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl 44(%esp,%edx), %edi
+; X86-NEXT:    shldl %cl, %edi, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl 32(%esp,%edx), %esi
+; X86-NEXT:    movl 36(%esp,%edx), %edx
 ; X86-NEXT:    shldl %cl, %edx, %ebx
-; X86-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl 64(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl %edx, 24(%eax)
-; X86-NEXT:    movl %ebx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -2925,8 +2803,7 @@ define i256 @lshr_allbits_i256(i256 %a0) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -2942,46 +2819,44 @@ define i256 @lshr_allbits_i256(i256 %a0) nounwind {
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $-1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $-1, (%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 40(%esp,%eax,4), %edx
-; X86-NEXT:    movl 36(%esp,%eax,4), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%esp,%eax,4), %ebx
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%esp,%eax,4), %edi
-; X86-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NEXT:    movl 56(%esp,%eax,4), %esi
-; X86-NEXT:    shrdl %cl, %esi, %edi
-; X86-NEXT:    movl 32(%esp,%eax,4), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 60(%esp,%eax,4), %edx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edx, 28(%eax)
-; X86-NEXT:    movl %esi, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
+; X86-NEXT:    movl %edi, 28(%eax)
+; X86-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 24(%eax)
+; X86-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 20(%eax)
+; X86-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl (%esp,%edx,4), %esi
+; X86-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -3090,22 +2965,21 @@ define i64 @lshr_extract_i256_i64(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -3115,6 +2989,7 @@ define i64 @lshr_extract_i256_i64(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
 ; X86-NEXT:    movzbl %al, %edx
@@ -3164,32 +3039,32 @@ define i64 @ashr_extract_i256_i64(i256 %a0, i256 %a1) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ebp), %ecx
 ; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, (%esp)
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
 ; X86-NEXT:    movzbl %al, %edx
@@ -3301,33 +3176,27 @@ define i64 @lshr_extract_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $96, %esp
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%ecx), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%ecx), %esi
-; X86-NEXT:    movl 12(%ecx), %edi
-; X86-NEXT:    movl 16(%ecx), %ebx
-; X86-NEXT:    movl 20(%ecx), %edx
-; X86-NEXT:    movl 24(%ecx), %eax
-; X86-NEXT:    movl 28(%ecx), %ecx
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 28(%eax), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 24(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, (%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -3336,20 +3205,20 @@ define i64 @lshr_extract_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
 ; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl 24(%esp,%edx,4), %esi
-; X86-NEXT:    movl 16(%esp,%edx,4), %eax
-; X86-NEXT:    movl 20(%esp,%edx,4), %edi
+; X86-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-NEXT:    movl (%esp,%edx,4), %eax
+; X86-NEXT:    movl 4(%esp,%edx,4), %edi
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %edi, %eax
-; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %a0 = load i256, ptr %p0
@@ -3432,33 +3301,27 @@ define i64 @ashr_extract_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $96, %esp
+; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %edi
-; X86-NEXT:    movl 12(%eax), %esi
-; X86-NEXT:    movl 16(%eax), %ebx
-; X86-NEXT:    movl 20(%eax), %edx
 ; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl 28(%eax), %eax
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl 28(%eax), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3468,20 +3331,20 @@ define i64 @ashr_extract_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $5, %al
 ; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl 24(%esp,%edx,4), %esi
-; X86-NEXT:    movl 16(%esp,%edx,4), %eax
-; X86-NEXT:    movl 20(%esp,%edx,4), %edi
+; X86-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-NEXT:    movl (%esp,%edx,4), %eax
+; X86-NEXT:    movl 4(%esp,%edx,4), %edi
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %edx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %edi, %eax
-; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %a0 = load i256, ptr %p0
@@ -3500,10 +3363,10 @@ define i64 @lshr_extract_idx_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86-LABEL: lshr_extract_idx_load_i256_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $3, %edx
-; X86-NEXT:    movl (%ecx,%edx,8), %eax
-; X86-NEXT:    movl 4(%ecx,%edx,8), %edx
+; X86-NEXT:    movl (%edx,%ecx,8), %eax
+; X86-NEXT:    movl 4(%edx,%ecx,8), %edx
 ; X86-NEXT:    retl
   %a0 = load i256, ptr %p0
   %m1 = mul i256 %a1, 64
@@ -3571,32 +3434,24 @@ define i64 @ashr_extract_idx_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $96, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %esi
-; X86-NEXT:    movl 12(%eax), %edi
-; X86-NEXT:    movl 16(%eax), %ebx
-; X86-NEXT:    movl 20(%eax), %edx
 ; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl 28(%eax), %eax
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl 20(%eax), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl 16(%eax), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movzbl 12(%ebp), %ecx
+; X86-NEXT:    movl 12(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl 28(%eax), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3607,13 +3462,11 @@ define i64 @ashr_extract_idx_load_i256_i64(ptr %p0, i256 %a1) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl 12(%ebp), %ecx
 ; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    movl 16(%esp,%ecx,8), %eax
-; X86-NEXT:    movl 20(%esp,%ecx,8), %edx
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl (%esp,%ecx,8), %eax
+; X86-NEXT:    movl 4(%esp,%ecx,8), %edx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %a0 = load i256, ptr %p0
diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll
index 604f5c19b92e5..ed1925d5601e5 100644
--- a/llvm/test/CodeGen/X86/shift-mask.ll
+++ b/llvm/test/CodeGen/X86/shift-mask.ll
@@ -543,9 +543,9 @@ define i32 @test_i32_lshr_lshr_2(i32 %a0) {
 define i64 @test_i64_lshr_lshr_0(i64 %a0) {
 ; X86-LABEL: test_i64_lshr_lshr_0:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $536870911, %edx # imm = 0x1FFFFFFF
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-MASK-LABEL: test_i64_lshr_lshr_0:
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 8f344490b66b7..ebdedb4d9855c 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -17,40 +17,36 @@
 define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movd %ecx, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm2, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm1
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8:
@@ -103,34 +99,30 @@ entry:
 define void @mul_4xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi8:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm2, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi8:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi8:
@@ -177,55 +169,49 @@ entry:
 define void @mul_8xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi8:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 16(%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm0, 16(%ecx,%eax,4)
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT:    movdqu %xmm2, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi8:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    movl c, %ecx
+; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm1, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi8:
 ; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    movl c, %ecx
+; X86-AVX2-NEXT:    vmovdqu %ymm0, (%ecx,%eax,4)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -290,77 +276,71 @@ entry:
 define void @mul_16xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi8:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm2
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
+; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; X86-SSE-NEXT:    movl c, %ecx
-; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm3
-; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X86-SSE-NEXT:    pmullw %xmm4, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; X86-SSE-NEXT:    pmullw %xmm3, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    movdqu %xmm0, 48(%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm3, 32(%ecx,%eax,4)
+; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; X86-SSE-NEXT:    movdqu %xmm4, 32(%ecx,%eax,4)
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X86-SSE-NEXT:    pmullw %xmm2, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm4, (%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi8:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT:    movl c, %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    movl c, %ecx
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, 32(%ecx,%eax,4)
 ; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
-; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vmovdqu %xmm1, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi8:
 ; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    movl c, %ecx
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%ecx,%eax,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%ecx,%eax,4)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -447,35 +427,31 @@ entry:
 define void @mul_2xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16:
@@ -523,33 +499,29 @@ entry:
 define void @mul_4xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi16:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
 ; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi16:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi16:
@@ -595,54 +567,48 @@ entry:
 define void @mul_8xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi16:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm1
 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
 ; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm0, 16(%ecx,%eax,4)
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi16:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    movl c, %ecx
+; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm1, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi16:
 ; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    movl c, %ecx
+; X86-AVX2-NEXT:    vmovdqu %ymm0, (%ecx,%eax,4)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -706,76 +672,70 @@ entry:
 define void @mul_16xi16(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
-; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
-; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm1
+; X86-SSE-NEXT:    movdqu 16(%ecx,%eax), %xmm2
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%ecx,%eax), %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE-NEXT:    pmulhuw %xmm2, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE-NEXT:    pmulhuw %xmm3, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE-NEXT:    pmullw %xmm2, %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm2, 32(%ecx,%eax,4)
 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
 ; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%ecx,%eax,4)
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    movdqu %xmm0, 16(%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT:    movl c, %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    movl c, %ecx
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, 32(%ecx,%eax,4)
 ; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
-; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vmovdqu %xmm1, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16:
 ; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    movl c, %ecx
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%ecx,%eax,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%ecx,%eax,4)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -861,42 +821,38 @@ entry:
 define void @mul_2xi8_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
-; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
-; X86-SSE-NEXT:    movd %esi, %xmm0
-; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    psraw $8, %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm1
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    psraw $8, %xmm1
 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    movl c, %ecx
 ; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm1
 ; X86-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext:
@@ -951,42 +907,38 @@ entry:
 define void @mul_2xi8_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
-; X86-SSE-NEXT:    movzwl (%esi,%eax), %esi
-; X86-SSE-NEXT:    movd %esi, %xmm0
-; X86-SSE-NEXT:    movzwl (%edx,%eax), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    pmaddwd %xmm1, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-SSE-NEXT:    movd %ecx, %xmm2
+; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7]
+; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm1
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm1
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
@@ -1041,35 +993,31 @@ entry:
 define void @mul_2xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
 ; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext:
@@ -1117,40 +1065,36 @@ entry:
 define void @mul_2xi16_sext_zext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SSE-NEXT:    psrad $16, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movq %xmm1, (%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
+; X86-SSE-NEXT:    psrad $16, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
@@ -1203,76 +1147,70 @@ entry:
 define void @mul_16xi16_sext(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    movl c, %ecx
-; X86-SSE-NEXT:    movdqu (%esi,%eax), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%esi,%eax), %xmm3
-; X86-SSE-NEXT:    movdqu (%edx,%eax), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%eax), %xmm1
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm1
+; X86-SSE-NEXT:    movdqu 16(%ecx,%eax), %xmm2
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movdqu (%ecx,%eax), %xmm0
+; X86-SSE-NEXT:    movdqu 16(%ecx,%eax), %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X86-SSE-NEXT:    pmulhw %xmm2, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm2, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE-NEXT:    pmulhw %xmm3, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm3, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE-NEXT:    pmullw %xmm2, %xmm3
+; X86-SSE-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movdqu %xmm2, 32(%ecx,%eax,4)
 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm1, 32(%ecx,%eax,4)
 ; X86-SSE-NEXT:    movdqu %xmm3, 48(%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%ecx,%eax,4)
-; X86-SSE-NEXT:    movdqu %xmm2, 16(%ecx,%eax,4)
-; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
+; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT:    movdqu %xmm1, (%ecx,%eax,4)
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT:    movdqu %xmm0, 16(%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16_sext:
 ; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX1-NEXT:    movl c, %ecx
-; X86-AVX1-NEXT:    vpmovsxwd 24(%esi,%eax), %xmm0
-; X86-AVX1-NEXT:    vpmovsxwd 16(%esi,%eax), %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%esi,%eax), %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%esi,%eax), %xmm3
-; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%eax), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vpmovsxwd 24(%ecx,%eax), %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%eax), %xmm1
+; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vpmovsxwd (%ecx,%eax), %xmm1
+; X86-AVX1-NEXT:    vpmovsxwd (%edx,%eax), %xmm2
+; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpmovsxwd 8(%ecx,%eax), %xmm2
+; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%eax), %xmm3
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmovsxwd 16(%ecx,%eax), %xmm3
 ; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%eax), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%eax), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%edx,%eax), %xmm4
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    movl c, %ecx
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%ecx,%eax,4)
+; X86-AVX1-NEXT:    vmovdqu %xmm3, 32(%ecx,%eax,4)
 ; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%ecx,%eax,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%ecx,%eax,4)
-; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    vmovdqu %xmm1, (%ecx,%eax,4)
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16_sext:
 ; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    vpmovsxwd 16(%ecx,%eax), %ymm0
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
-; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
-; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
+; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%eax), %ymm1
+; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpmovsxwd (%ecx,%eax), %ymm1
+; X86-AVX2-NEXT:    vpmovsxwd (%edx,%eax), %ymm2
 ; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    movl c, %ecx
+; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%ecx,%eax,4)
+; X86-AVX2-NEXT:    vmovdqu %ymm1, (%ecx,%eax,4)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -1359,26 +1297,26 @@ define void @mul_2xi8_varconst1(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,255,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst1:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,255,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst1:
@@ -1423,26 +1361,26 @@ define void @mul_2xi8_varconst2(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    psrad $24, %xmm0
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,127,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst2:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,127,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst2:
@@ -1487,26 +1425,26 @@ define void @mul_2xi8_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,256,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst3:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,0,256,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst3:
@@ -1551,26 +1489,26 @@ define void @mul_2xi8_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65535,0,255,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst4:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65535,0,255,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst4:
@@ -1615,26 +1553,26 @@ define void @mul_2xi8_varconst5(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    psrad $24, %xmm0
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65407,0,127,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst5:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65407,0,127,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst5:
@@ -1679,26 +1617,26 @@ define void @mul_2xi8_varconst6(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-SSE-NEXT:    movd %ecx, %xmm0
 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE-NEXT:    psrad $24, %xmm0
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65408,0,128,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_varconst6:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
 ; X86-AVX-NEXT:    vmovd %ecx, %xmm0
 ; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [65408,0,128,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_varconst6:
@@ -1743,25 +1681,25 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0]
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst1:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65535,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_varconst1:
@@ -1805,22 +1743,22 @@ define void @mul_2xi16_varconst2(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
 ; X86-SSE-NEXT:    pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,0,32767,0,u,u,u,u]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst2:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X86-AVX-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [32768,0,32767,0,u,u,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_varconst2:
@@ -1861,23 +1799,23 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [65536,65536,u,u]
 ; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst3:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,65536,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_varconst3:
@@ -1919,24 +1857,24 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) {
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT:    psrad $16, %xmm0
 ; X86-SSE-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,u,u]
 ; X86-SSE-NEXT:    psllq $32, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
+; X86-SSE-NEXT:    movl c, %ecx
+; X86-SSE-NEXT:    movq %xmm0, (%ecx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_varconst4:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [0,32768,u,u]
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
+; X86-AVX-NEXT:    movl c, %ecx
+; X86-AVX-NEXT:    vmovq %xmm0, (%ecx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_varconst4:
@@ -1980,82 +1918,87 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
 ; X86-SSE-NEXT:    pushl %ebx
 ; X86-SSE-NEXT:    pushl %edi
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    pushl %eax
+; X86-SSE-NEXT:    subl $16, %esp
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movdqa (%esi), %xmm0
+; X86-SSE-NEXT:    pextrw $3, %xmm0, %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movzwl 16(%eax), %edx
-; X86-SSE-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SSE-NEXT:    movdqa (%eax), %xmm2
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE-NEXT:    pextrw $7, %xmm2, %eax
-; X86-SSE-NEXT:    pextrw $4, %xmm2, %esi
-; X86-SSE-NEXT:    pextrw $1, %xmm2, %edi
-; X86-SSE-NEXT:    pextrw $0, %xmm2, %ebx
-; X86-SSE-NEXT:    pextrw $3, %xmm2, %ebp
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 28(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-SSE-NEXT:    movd %xmm3, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 24(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm3
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 16(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    divl 12(%ecx)
+; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 20(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X86-SSE-NEXT:    movl %edi, %eax
+; X86-SSE-NEXT:    divl (%ecx)
+; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT:    pextrw $1, %xmm0, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
 ; X86-SSE-NEXT:    divl 4(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm3
-; X86-SSE-NEXT:    movl %ebx, %eax
+; X86-SSE-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE-NEXT:    pextrw $4, %xmm0, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl (%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X86-SSE-NEXT:    movl %ebp, %eax
+; X86-SSE-NEXT:    divl 16(%ecx)
+; X86-SSE-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-SSE-NEXT:    pextrw $7, %xmm0, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 12(%ecx)
-; X86-SSE-NEXT:    movd %edx, %xmm3
+; X86-SSE-NEXT:    divl 28(%ecx)
+; X86-SSE-NEXT:    movl %edx, %ebx
+; X86-SSE-NEXT:    movzwl 16(%esi), %eax
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 32(%ecx)
+; X86-SSE-NEXT:    imull $8199, %edx, %edi # imm = 0x2007
+; X86-SSE-NEXT:    pxor %xmm1, %xmm1
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; X86-SSE-NEXT:    movd %xmm2, %eax
 ; X86-SSE-NEXT:    xorl %edx, %edx
 ; X86-SSE-NEXT:    divl 8(%ecx)
+; X86-SSE-NEXT:    movl %edx, %ebp
+; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 20(%ecx)
+; X86-SSE-NEXT:    movl %edx, %esi
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    xorl %edx, %edx
+; X86-SSE-NEXT:    divl 24(%ecx)
+; X86-SSE-NEXT:    movl %edi, (%eax)
+; X86-SSE-NEXT:    movd %ebx, %xmm1
 ; X86-SSE-NEXT:    movd %edx, %xmm2
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-SSE-NEXT:    movd (%esp), %xmm0 # 4-byte Folded Reload
+; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd %esi, %xmm4
+; X86-SSE-NEXT:    movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 4-byte Folded Reload
+; X86-SSE-NEXT:    # xmm5 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 4-byte Folded Reload
+; X86-SSE-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 4-byte Folded Reload
+; X86-SSE-NEXT:    # xmm6 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movd %ebp, %xmm7
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 32(%ecx)
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
+; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm3
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
-; X86-SSE-NEXT:    movl %eax, (%eax)
-; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
-; X86-SSE-NEXT:    addl $4, %esp
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-SSE-NEXT:    movdqa %xmm2, (%eax)
+; X86-SSE-NEXT:    addl $16, %esp
 ; X86-SSE-NEXT:    popl %esi
 ; X86-SSE-NEXT:    popl %edi
 ; X86-SSE-NEXT:    popl %ebx
@@ -2069,61 +2012,62 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
 ; X86-AVX1-NEXT:    pushl %edi
 ; X86-AVX1-NEXT:    pushl %esi
 ; X86-AVX1-NEXT:    subl $16, %esp
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vmovd %xmm2, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 32(%ecx)
+; X86-AVX1-NEXT:    divl 12(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 28(%ecx)
+; X86-AVX1-NEXT:    divl 8(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 24(%ecx)
+; X86-AVX1-NEXT:    divl 4(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 20(%ecx)
+; X86-AVX1-NEXT:    divl (%ecx)
 ; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vmovd %xmm1, %eax
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 16(%ecx)
+; X86-AVX1-NEXT:    divl 28(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, %ebp
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 12(%ecx)
+; X86-AVX1-NEXT:    divl 24(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, %ebx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 8(%ecx)
+; X86-AVX1-NEXT:    divl 20(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, %esi
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 4(%ecx)
+; X86-AVX1-NEXT:    divl 16(%ecx)
 ; X86-AVX1-NEXT:    movl %edx, %edi
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl (%ecx)
-; X86-AVX1-NEXT:    vmovd %edx, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    divl 32(%ecx)
+; X86-AVX1-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
+; X86-AVX1-NEXT:    movl %eax, (%eax)
+; X86-AVX1-NEXT:    vmovd %edi, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpinsrd $3, %ebp, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd (%esp), %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT:    vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-AVX1-NEXT:    # imm = 0x2007
-; X86-AVX1-NEXT:    movl %eax, (%eax)
 ; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8199,8199,8199,8199]
 ; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vmovdqa %xmm1, (%eax)
+; X86-AVX1-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX1-NEXT:    addl $16, %esp
 ; X86-AVX1-NEXT:    popl %esi
@@ -2134,56 +2078,72 @@ define void @PR34947(ptr %p0, ptr %p1) nounwind {
 ;
 ; X86-AVX2-LABEL: PR34947:
 ; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    pushl %ebp
+; X86-AVX2-NEXT:    pushl %ebx
+; X86-AVX2-NEXT:    pushl %edi
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-AVX2-NEXT:    subl $16, %esp
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %eax
+; X86-AVX2-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 20(%esi)
-; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm2, %eax
+; X86-AVX2-NEXT:    divl 12(%ecx)
+; X86-AVX2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX2-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 16(%esi)
-; X86-AVX2-NEXT:    vmovd %edx, %xmm3
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %eax
+; X86-AVX2-NEXT:    divl 8(%ecx)
+; X86-AVX2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 24(%esi)
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %eax
+; X86-AVX2-NEXT:    divl 4(%ecx)
+; X86-AVX2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 28(%esi)
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm2
-; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
+; X86-AVX2-NEXT:    divl (%ecx)
+; X86-AVX2-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT:    vpextrd $3, %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 4(%esi)
-; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm1, %eax
+; X86-AVX2-NEXT:    divl 28(%ecx)
+; X86-AVX2-NEXT:    movl %edx, %ebp
+; X86-AVX2-NEXT:    vpextrd $2, %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl (%esi)
-; X86-AVX2-NEXT:    vmovd %edx, %xmm3
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
+; X86-AVX2-NEXT:    divl 24(%ecx)
+; X86-AVX2-NEXT:    movl %edx, %ebx
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 8(%esi)
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
+; X86-AVX2-NEXT:    divl 20(%ecx)
+; X86-AVX2-NEXT:    movl %edx, %esi
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 12(%esi)
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm3, %xmm1
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    divl 16(%ecx)
+; X86-AVX2-NEXT:    movl %edx, %edi
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; X86-AVX2-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 32(%esi)
-; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    divl 32(%ecx)
 ; X86-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
 ; X86-AVX2-NEXT:    movl %eax, (%eax)
+; X86-AVX2-NEXT:    vmovd %edi, %xmm0
+; X86-AVX2-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpinsrd $2, %ebx, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpinsrd $3, %ebp, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovd (%esp), %xmm1 # 4-byte Folded Reload
+; X86-AVX2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX2-NEXT:    vpinsrd $1, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX2-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX2-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
+; X86-AVX2-NEXT:    addl $16, %esp
 ; X86-AVX2-NEXT:    popl %esi
+; X86-AVX2-NEXT:    popl %edi
+; X86-AVX2-NEXT:    popl %ebx
+; X86-AVX2-NEXT:    popl %ebp
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/shrink_vmul_sse.ll b/llvm/test/CodeGen/X86/shrink_vmul_sse.ll
index 2812447e44417..7f550e7324fb3 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -10,24 +10,22 @@
 define void @mul_2xi8(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %index) nounwind {
 ; CHECK-LABEL: mul_2xi8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movl c, %esi
-; CHECK-NEXT:    movzbl 1(%edx,%ecx), %edi
-; CHECK-NEXT:    movzbl (%edx,%ecx), %edx
-; CHECK-NEXT:    movzbl 1(%eax,%ecx), %ebx
-; CHECK-NEXT:    imull %edi, %ebx
-; CHECK-NEXT:    movzbl (%eax,%ecx), %eax
-; CHECK-NEXT:    imull %edx, %eax
-; CHECK-NEXT:    movl %ebx, 4(%esi,%ecx,4)
-; CHECK-NEXT:    movl %eax, (%esi,%ecx,4)
+; CHECK-NEXT:    movzbl 1(%ecx,%eax), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movzbl 1(%esi,%eax), %edi
+; CHECK-NEXT:    imull %edx, %edi
+; CHECK-NEXT:    movzbl (%esi,%eax), %edx
+; CHECK-NEXT:    movzbl (%ecx,%eax), %ecx
+; CHECK-NEXT:    imull %edx, %ecx
+; CHECK-NEXT:    movl c, %edx
+; CHECK-NEXT:    movl %edi, 4(%edx,%eax,4)
+; CHECK-NEXT:    movl %ecx, (%edx,%eax,4)
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
-; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:    retl
 entry:
   %pre = load ptr, ptr @c
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index 9f90657dc64d1..9bb0f49b77a2a 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -15,9 +15,9 @@ define <16 x i16> @blendw_to_blendd_32(<16 x i16> %x, <16 x i16> %y, <16 x i16>
 ; X86-SSE41-NEXT:    movl %esp, %ebp
 ; X86-SSE41-NEXT:    andl $-16, %esp
 ; X86-SSE41-NEXT:    subl $16, %esp
-; X86-SSE41-NEXT:    paddw 40(%ebp), %xmm1
 ; X86-SSE41-NEXT:    paddw 24(%ebp), %xmm0
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-SSE41-NEXT:    paddw 40(%ebp), %xmm1
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5],mem[6,7]
 ; X86-SSE41-NEXT:    movl %ebp, %esp
 ; X86-SSE41-NEXT:    popl %ebp
@@ -139,9 +139,9 @@ define <16 x i16> @blendw_to_blendd_fail_32(<16 x i16> %x, <16 x i16> %y, <16 x
 ; X86-SSE41-NEXT:    movl %esp, %ebp
 ; X86-SSE41-NEXT:    andl $-16, %esp
 ; X86-SSE41-NEXT:    subl $16, %esp
-; X86-SSE41-NEXT:    paddw 40(%ebp), %xmm1
 ; X86-SSE41-NEXT:    paddw 24(%ebp), %xmm0
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; X86-SSE41-NEXT:    paddw 40(%ebp), %xmm1
 ; X86-SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5],mem[6,7]
 ; X86-SSE41-NEXT:    movl %ebp, %esp
 ; X86-SSE41-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll
index d1137cac7d365..0445d3ca760b0 100644
--- a/llvm/test/CodeGen/X86/sibcall.ll
+++ b/llvm/test/CodeGen/X86/sibcall.ll
@@ -656,9 +656,8 @@ define fastcc void @t21_sret_to_sret_more_args2(ptr noalias sret(%struct.foo) %a
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    calll f_sret at PLT
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    addl $8, %esp
diff --git a/llvm/test/CodeGen/X86/signed-truncation-check.ll b/llvm/test/CodeGen/X86/signed-truncation-check.ll
index fc0fbb206cbb3..987334f3c8cf4 100644
--- a/llvm/test/CodeGen/X86/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/X86/signed-truncation-check.ll
@@ -108,11 +108,10 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movswl %ax, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
@@ -133,11 +132,10 @@ define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsbl %al, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/sincos-stack-args.ll b/llvm/test/CodeGen/X86/sincos-stack-args.ll
index 42c05a3e7a9be..488418bfa316b 100644
--- a/llvm/test/CodeGen/X86/sincos-stack-args.ll
+++ b/llvm/test/CodeGen/X86/sincos-stack-args.ll
@@ -11,18 +11,17 @@ define double @negative_sincos_with_stores_within_call_sequence(double %a) nounw
 ; CHECK-LABEL: negative_sincos_with_stores_within_call_sequence:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    subl $44, %esp
-; CHECK-NEXT:    fldl 48(%esp)
 ; CHECK-NEXT:    leal 24(%esp), %eax
 ; CHECK-NEXT:    movl %eax, 12(%esp)
 ; CHECK-NEXT:    leal 32(%esp), %eax
 ; CHECK-NEXT:    movl %eax, 8(%esp)
+; CHECK-NEXT:    fldl 48(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
 ; CHECK-NEXT:    calll sincos
 ; CHECK-NEXT:    fldl 32(%esp)
+; CHECK-NEXT:    fstpl 8(%esp)
 ; CHECK-NEXT:    fldl 24(%esp)
 ; CHECK-NEXT:    faddl {{\.?LCPI[0-9]+_[0-9]+}}
-; CHECK-NEXT:    fxch %st(1)
-; CHECK-NEXT:    fstpl 8(%esp)
 ; CHECK-NEXT:    fstpl (%esp)
 ; CHECK-NEXT:    calll g at PLT
 ; CHECK-NEXT:    addl $44, %esp
diff --git a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll
index cc46f04b8dabe..c4c1a25918198 100644
--- a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll
+++ b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll
@@ -446,13 +446,21 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 ; sub %y, (sub C, %x)
 
 define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
-; ALL-LABEL: vec_sink_sub_from_const_to_sub:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movdqa {{.*#+}} xmm2 = [42,24,u,46]
-; ALL-NEXT:    paddd %xmm1, %xmm0
-; ALL-NEXT:    psubd %xmm0, %xmm2
-; ALL-NEXT:    movdqa %xmm2, %xmm0
-; ALL-NEXT:    ret{{[l|q]}}
+; X32-LABEL: vec_sink_sub_from_const_to_sub:
+; X32:       # %bb.0:
+; X32-NEXT:    paddd %xmm1, %xmm0
+; X32-NEXT:    movdqa {{.*#+}} xmm1 = [42,24,u,46]
+; X32-NEXT:    psubd %xmm0, %xmm1
+; X32-NEXT:    movdqa %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: vec_sink_sub_from_const_to_sub:
+; X64:       # %bb.0:
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [42,24,u,46]
+; X64-NEXT:    paddd %xmm1, %xmm0
+; X64-NEXT:    psubd %xmm0, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
   %t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
   %r = sub <4 x i32> %t0, %b
   ret <4 x i32> %r
@@ -473,3 +481,5 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
   %r = sub <4 x i32> %b, %t0
   ret <4 x i32> %r
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index e8c05f9b562d8..17027431a1b08 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -99,35 +99,68 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
 }
 
 define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
-; SLM-LABEL: test_mul_v8i32_v8i8:
-; SLM:       # %bb.0:
-; SLM-NEXT:    movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pmaddwd %xmm2, %xmm0
-; SLM-NEXT:    pmaddwd %xmm2, %xmm1
-; SLM-NEXT:    ret{{[l|q]}}
-;
-; SLOW-LABEL: test_mul_v8i32_v8i8:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmaddwd %xmm2, %xmm0
-; SLOW-NEXT:    pmaddwd %xmm2, %xmm1
-; SLOW-NEXT:    ret{{[l|q]}}
+; SLM-32-LABEL: test_mul_v8i32_v8i8:
+; SLM-32:       # %bb.0:
+; SLM-32-NEXT:    movdqa {{.*#+}} xmm3 = [18778,0,18778,0,18778,0,18778,0]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SLM-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SLM-32-NEXT:    movdqa %xmm2, %xmm0
+; SLM-32-NEXT:    retl
+;
+; SLM-64-LABEL: test_mul_v8i32_v8i8:
+; SLM-64:       # %bb.0:
+; SLM-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLM-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SLM-64-NEXT:    retq
+;
+; SLOW-32-LABEL: test_mul_v8i32_v8i8:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SLOW-32-NEXT:    movdqa %xmm2, %xmm0
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v8i32_v8i8:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLOW-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v8i32_v8i8:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SSE4-32-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-32-NEXT:    retl
 ;
-; SSE4-LABEL: test_mul_v8i32_v8i8:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmaddwd %xmm2, %xmm0
-; SSE4-NEXT:    pmaddwd %xmm2, %xmm1
-; SSE4-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v8i32_v8i8:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SSE4-64-NEXT:    retq
 ;
 ; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8:
 ; AVX2-SLOW32:       # %bb.0:
@@ -196,72 +229,135 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
 }
 
 define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
-; SLM-LABEL: test_mul_v16i32_v16i8:
-; SLM:       # %bb.0:
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM-NEXT:    movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
-; SLM-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SLM-NEXT:    pmaddwd %xmm5, %xmm0
-; SLM-NEXT:    pmaddwd %xmm5, %xmm1
-; SLM-NEXT:    pmaddwd %xmm5, %xmm2
-; SLM-NEXT:    pmaddwd %xmm5, %xmm3
-; SLM-NEXT:    ret{{[l|q]}}
+; SLM-32-LABEL: test_mul_v16i32_v16i8:
+; SLM-32:       # %bb.0:
+; SLM-32-NEXT:    movdqa %xmm0, %xmm2
+; SLM-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm0
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm1
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm2
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm3
+; SLM-32-NEXT:    retl
+;
+; SLM-64-LABEL: test_mul_v16i32_v16i8:
+; SLM-64:       # %bb.0:
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLM-64-NEXT:    movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm0
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm1
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm2
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm3
+; SLM-64-NEXT:    retq
+;
+; SLOW-32-LABEL: test_mul_v16i32_v16i8:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm4
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm1
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm2
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm3
+; SLOW-32-NEXT:    movdqa %xmm4, %xmm0
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v16i32_v16i8:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm3
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v16i32_v16i8:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm4
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm1
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm2
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm3
+; SSE4-32-NEXT:    movdqa %xmm4, %xmm0
+; SSE4-32-NEXT:    retl
 ;
-; SLOW-LABEL: test_mul_v16i32_v16i8:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm0
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm1
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm2
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm3
-; SLOW-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v16i32_v16i8:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
+; SSE4-64-NEXT:    retq
 ;
-; SSE4-LABEL: test_mul_v16i32_v16i8:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm0
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm1
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm2
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm3
-; SSE4-NEXT:    ret{{[l|q]}}
-;
-; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
-; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    ret{{[l|q]}}
+; AVX2-SLOW32-LABEL: test_mul_v16i32_v16i8:
+; AVX2-SLOW32:       # %bb.0:
+; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-SLOW32-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX2-SLOW32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT:    vpmaddwd %ymm3, %ymm0, %ymm1
+; AVX2-SLOW32-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-SLOW32-NEXT:    retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v16i32_v16i8:
+; AVX2-SLOW64:       # %bb.0:
+; AVX2-SLOW64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-SLOW64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-SLOW64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
+; AVX2-SLOW64-NEXT:    retq
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8:
 ; AVX2-32:       # %bb.0:
-; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-32-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
-; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmaddwd %ymm3, %ymm0, %ymm1
+; AVX2-32-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i8:
@@ -394,16 +490,27 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
 ; SLOW-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SLOW-NEXT:    ret{{[l|q]}}
 ;
-; SSE4-LABEL: test_mul_v8i32_v8i16:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pxor %xmm1, %xmm1
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmulld %xmm1, %xmm2
-; SSE4-NEXT:    pmulld %xmm0, %xmm1
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; SSE4-32-LABEL: test_mul_v8i32_v8i16:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
+; SSE4-32-NEXT:    pxor %xmm3, %xmm3
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE4-32-NEXT:    pmulld %xmm0, %xmm1
+; SSE4-32-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-32-NEXT:    retl
+;
+; SSE4-64-LABEL: test_mul_v8i32_v8i16:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pxor %xmm1, %xmm1
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
+; SSE4-64-NEXT:    pmulld %xmm0, %xmm1
+; SSE4-64-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-64-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
 ; AVX2-SLOW:       # %bb.0:
@@ -472,21 +579,37 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ; SLOW-NEXT:    movdqa %xmm4, %xmm1
 ; SLOW-NEXT:    ret{{[l|q]}}
 ;
-; SSE4-LABEL: test_mul_v16i32_v16i16:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    movdqa %xmm0, %xmm4
-; SSE4-NEXT:    pxor %xmm3, %xmm3
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmulld %xmm3, %xmm0
-; SSE4-NEXT:    pmulld %xmm3, %xmm4
-; SSE4-NEXT:    pmulld %xmm3, %xmm2
-; SSE4-NEXT:    pmulld %xmm1, %xmm3
-; SSE4-NEXT:    movdqa %xmm4, %xmm1
-; SSE4-NEXT:    ret{{[l|q]}}
+; SSE4-32-LABEL: test_mul_v16i32_v16i16:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm0
+; SSE4-32-NEXT:    pxor %xmm5, %xmm5
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm4
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm2
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
+; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-32-NEXT:    retl
+;
+; SSE4-64-LABEL: test_mul_v16i32_v16i16:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-64-NEXT:    pxor %xmm3, %xmm3
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm0
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm4
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm2
+; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
+; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-64-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
 ; AVX2-SLOW:       # %bb.0:
@@ -501,12 +624,13 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i16:
 ; AVX2-32:       # %bb.0:
-; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-32-NEXT:    vpmulld %ymm3, %ymm1, %ymm2
+; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmulld %ymm3, %ymm0, %ymm1
+; AVX2-32-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i16:
@@ -619,35 +743,68 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
 }
 
 define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
-; SLM-LABEL: test_mul_v8i32_v8i8_minsize:
-; SLM:       # %bb.0:
-; SLM-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pmaddwd %xmm2, %xmm0
-; SLM-NEXT:    pmaddwd %xmm2, %xmm1
-; SLM-NEXT:    ret{{[l|q]}}
-;
-; SLOW-LABEL: test_mul_v8i32_v8i8_minsize:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmaddwd %xmm2, %xmm0
-; SLOW-NEXT:    pmaddwd %xmm2, %xmm1
-; SLOW-NEXT:    ret{{[l|q]}}
+; SLM-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLM-32:       # %bb.0:
+; SLM-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SLM-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SLM-32-NEXT:    movdqa %xmm2, %xmm0
+; SLM-32-NEXT:    retl
+;
+; SLM-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLM-64:       # %bb.0:
+; SLM-64-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLM-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SLM-64-NEXT:    retq
+;
+; SLOW-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SLOW-32-NEXT:    movdqa %xmm2, %xmm0
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLOW-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmaddwd %xmm3, %xmm2
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm3, %xmm1
+; SSE4-32-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-32-NEXT:    retl
 ;
-; SSE4-LABEL: test_mul_v8i32_v8i8_minsize:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmaddwd %xmm2, %xmm0
-; SSE4-NEXT:    pmaddwd %xmm2, %xmm1
-; SSE4-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
+; SSE4-64-NEXT:    retq
 ;
 ; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
 ; AVX2-SLOW32:       # %bb.0:
@@ -716,72 +873,135 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
 }
 
 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
-; SLM-LABEL: test_mul_v16i32_v16i8_minsize:
-; SLM:       # %bb.0:
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
-; SLM-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLM-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SLM-NEXT:    pmaddwd %xmm5, %xmm0
-; SLM-NEXT:    pmaddwd %xmm5, %xmm1
-; SLM-NEXT:    pmaddwd %xmm5, %xmm2
-; SLM-NEXT:    pmaddwd %xmm5, %xmm3
-; SLM-NEXT:    ret{{[l|q]}}
+; SLM-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLM-32:       # %bb.0:
+; SLM-32-NEXT:    movdqa %xmm0, %xmm2
+; SLM-32-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SLM-32-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm0
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm1
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm2
+; SLM-32-NEXT:    pmaddwd %xmm4, %xmm3
+; SLM-32-NEXT:    retl
+;
+; SLM-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLM-64:       # %bb.0:
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLM-64-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm0
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm1
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm2
+; SLM-64-NEXT:    pmaddwd %xmm5, %xmm3
+; SLM-64-NEXT:    retq
+;
+; SLOW-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm4
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm1
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm2
+; SLOW-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SLOW-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-32-NEXT:    pmaddwd %xmm5, %xmm3
+; SLOW-32-NEXT:    movdqa %xmm4, %xmm0
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SLOW-64-NEXT:    pmaddwd %xmm4, %xmm3
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm4
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm1
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm2
+; SSE4-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-32-NEXT:    pmaddwd %xmm5, %xmm3
+; SSE4-32-NEXT:    movdqa %xmm4, %xmm0
+; SSE4-32-NEXT:    retl
 ;
-; SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SLOW-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm0
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm1
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm2
-; SLOW-NEXT:    pmaddwd %xmm4, %xmm3
-; SLOW-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
+; SSE4-64-NEXT:    retq
 ;
-; SSE4-LABEL: test_mul_v16i32_v16i8_minsize:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE4-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm0
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm1
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm2
-; SSE4-NEXT:    pmaddwd %xmm4, %xmm3
-; SSE4-NEXT:    ret{{[l|q]}}
-;
-; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
-; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    ret{{[l|q]}}
+; AVX2-SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX2-SLOW32:       # %bb.0:
+; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-SLOW32-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX2-SLOW32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-SLOW32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT:    vpmaddwd %ymm3, %ymm0, %ymm1
+; AVX2-SLOW32-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-SLOW32-NEXT:    retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX2-SLOW64:       # %bb.0:
+; AVX2-SLOW64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-SLOW64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-SLOW64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-SLOW64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
+; AVX2-SLOW64-NEXT:    retq
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
 ; AVX2-32:       # %bb.0:
-; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
+; AVX2-32-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0,18778,0]
-; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmaddwd %ymm3, %ymm0, %ymm1
+; AVX2-32-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -870,27 +1090,49 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
 ; SLM-NEXT:    movdqa %xmm2, %xmm0
 ; SLM-NEXT:    ret{{[l|q]}}
 ;
-; SLOW-LABEL: test_mul_v8i32_v8i16_minsize:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    pxor %xmm1, %xmm1
-; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLOW-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmulld %xmm1, %xmm2
-; SLOW-NEXT:    pmulld %xmm0, %xmm1
-; SLOW-NEXT:    movdqa %xmm2, %xmm0
-; SLOW-NEXT:    ret{{[l|q]}}
+; SLOW-32-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmulld %xmm1, %xmm2
+; SLOW-32-NEXT:    pxor %xmm3, %xmm3
+; SLOW-32-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SLOW-32-NEXT:    pmulld %xmm0, %xmm1
+; SLOW-32-NEXT:    movdqa %xmm2, %xmm0
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v8i32_v8i16_minsize:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    pxor %xmm1, %xmm1
+; SLOW-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW-64-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmulld %xmm1, %xmm2
+; SLOW-64-NEXT:    pmulld %xmm0, %xmm1
+; SLOW-64-NEXT:    movdqa %xmm2, %xmm0
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
+; SSE4-32-NEXT:    pxor %xmm3, %xmm3
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE4-32-NEXT:    pmulld %xmm0, %xmm1
+; SSE4-32-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-32-NEXT:    retl
 ;
-; SSE4-LABEL: test_mul_v8i32_v8i16_minsize:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pxor %xmm1, %xmm1
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmulld %xmm1, %xmm2
-; SSE4-NEXT:    pmulld %xmm0, %xmm1
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    pxor %xmm1, %xmm1
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
+; SSE4-64-NEXT:    pmulld %xmm0, %xmm1
+; SSE4-64-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-64-NEXT:    retq
 ;
 ; AVX2-LABEL: test_mul_v8i32_v8i16_minsize:
 ; AVX2:       # %bb.0:
@@ -920,56 +1162,100 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
 ; SLM-NEXT:    movdqa %xmm4, %xmm1
 ; SLM-NEXT:    ret{{[l|q]}}
 ;
-; SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
-; SLOW:       # %bb.0:
-; SLOW-NEXT:    movdqa %xmm0, %xmm4
-; SLOW-NEXT:    pxor %xmm3, %xmm3
-; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLOW-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SLOW-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLOW-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SLOW-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
-; SLOW-NEXT:    pmulld %xmm3, %xmm0
-; SLOW-NEXT:    pmulld %xmm3, %xmm4
-; SLOW-NEXT:    pmulld %xmm3, %xmm2
-; SLOW-NEXT:    pmulld %xmm1, %xmm3
-; SLOW-NEXT:    movdqa %xmm4, %xmm1
-; SLOW-NEXT:    ret{{[l|q]}}
+; SLOW-32-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLOW-32:       # %bb.0:
+; SLOW-32-NEXT:    movdqa %xmm0, %xmm4
+; SLOW-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SLOW-32-NEXT:    pmulld %xmm3, %xmm0
+; SLOW-32-NEXT:    pxor %xmm5, %xmm5
+; SLOW-32-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SLOW-32-NEXT:    pmulld %xmm3, %xmm4
+; SLOW-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW-32-NEXT:    pmulld %xmm3, %xmm2
+; SLOW-32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SLOW-32-NEXT:    pmulld %xmm1, %xmm3
+; SLOW-32-NEXT:    movdqa %xmm4, %xmm1
+; SLOW-32-NEXT:    retl
+;
+; SLOW-64-LABEL: test_mul_v16i32_v16i16_minsize:
+; SLOW-64:       # %bb.0:
+; SLOW-64-NEXT:    movdqa %xmm0, %xmm4
+; SLOW-64-NEXT:    pxor %xmm3, %xmm3
+; SLOW-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLOW-64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SLOW-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SLOW-64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SLOW-64-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SLOW-64-NEXT:    pmulld %xmm3, %xmm0
+; SLOW-64-NEXT:    pmulld %xmm3, %xmm4
+; SLOW-64-NEXT:    pmulld %xmm3, %xmm2
+; SLOW-64-NEXT:    pmulld %xmm1, %xmm3
+; SLOW-64-NEXT:    movdqa %xmm4, %xmm1
+; SLOW-64-NEXT:    retq
+;
+; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
+; SSE4-32:       # %bb.0:
+; SSE4-32-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-32-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm0
+; SSE4-32-NEXT:    pxor %xmm5, %xmm5
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm4
+; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-32-NEXT:    pmulld %xmm3, %xmm2
+; SSE4-32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
+; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-32-NEXT:    retl
 ;
-; SSE4-LABEL: test_mul_v16i32_v16i16_minsize:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    movdqa %xmm0, %xmm4
-; SSE4-NEXT:    pxor %xmm3, %xmm3
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE4-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE4-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE4-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
-; SSE4-NEXT:    pmulld %xmm3, %xmm0
-; SSE4-NEXT:    pmulld %xmm3, %xmm4
-; SSE4-NEXT:    pmulld %xmm3, %xmm2
-; SSE4-NEXT:    pmulld %xmm1, %xmm3
-; SSE4-NEXT:    movdqa %xmm4, %xmm1
-; SSE4-NEXT:    ret{{[l|q]}}
-;
-; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-SLOW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    ret{{[l|q]}}
+; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
+; SSE4-64:       # %bb.0:
+; SSE4-64-NEXT:    movdqa %xmm0, %xmm4
+; SSE4-64-NEXT:    pxor %xmm3, %xmm3
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE4-64-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE4-64-NEXT:    pmovsxwd {{.*#+}} xmm3 = [18778,18778,18778,18778]
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm0
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm4
+; SSE4-64-NEXT:    pmulld %xmm3, %xmm2
+; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
+; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
+; SSE4-64-NEXT:    retq
+;
+; AVX2-SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
+; AVX2-SLOW32:       # %bb.0:
+; AVX2-SLOW32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-SLOW32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW32-NEXT:    vpmulld %ymm3, %ymm1, %ymm2
+; AVX2-SLOW32-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-SLOW32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-SLOW32-NEXT:    vpmulld %ymm3, %ymm0, %ymm1
+; AVX2-SLOW32-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX2-SLOW32-NEXT:    retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
+; AVX2-SLOW64:       # %bb.0:
+; AVX2-SLOW64-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-SLOW64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-SLOW64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-SLOW64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-SLOW64-NEXT:    retq
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
 ; AVX2-32:       # %bb.0:
-; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-32-NEXT:    vpmulld %ymm3, %ymm1, %ymm2
+; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmulld %ymm3, %ymm0, %ymm1
+; AVX2-32-NEXT:    vmovdqa %ymm2, %ymm0
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
@@ -998,7 +1284,4 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
   ret <16 x i32> %m
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SLM-32: {{.*}}
-; SLM-64: {{.*}}
-; SLOW-32: {{.*}}
-; SLOW-64: {{.*}}
+; SSE4: {{.*}}
diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
index a215b60055dd5..379c76a1abad0 100644
--- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; Intel chips with slow unaligned memory accesses
 
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
@@ -72,75 +73,43 @@ define void @store_zeros(ptr %a) {
 ; SLOW-SCALAR-LABEL: store_zeros:
 ; SLOW-SCALAR:       # %bb.0:
 ; SLOW-SCALAR-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NEXT:    movl $0
-; SLOW-SCALAR-NOT:     movl
-;
-; SLOW-SSE-LABEL: store_zeros:
-; SLOW-SSE:       # %bb.0:
-; SLOW-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SLOW-SSE-NEXT:    xorps %xmm0, %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NEXT:    movsd %xmm0
-; SLOW-SSE-NOT:     movsd
-;
-; FAST-SSE-LABEL: store_zeros:
-; FAST-SSE:       # %bb.0:
-; FAST-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FAST-SSE-NEXT:    xorps %xmm0, %xmm0
-; FAST-SSE-NEXT:    movups %xmm0
-; FAST-SSE-NEXT:    movups %xmm0
-; FAST-SSE-NEXT:    movups %xmm0
-; FAST-SSE-NEXT:    movups %xmm0
-; FAST-SSE-NOT:     movups
+; SLOW-SCALAR-NEXT:    movl $0, 60(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 56(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 52(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 48(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 44(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 40(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 36(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 32(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 28(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 24(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 20(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 16(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 12(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 8(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, 4(%eax)
+; SLOW-SCALAR-NEXT:    movl $0, (%eax)
+; SLOW-SCALAR-NEXT:    retl
 ;
 ; FAST-AVX128-LABEL: store_zeros:
 ; FAST-AVX128:       # %bb.0:
-; FAST-AVX128-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FAST-AVX128-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; FAST-AVX128-NEXT:    vmovups %xmm0
-; FAST-AVX128-NEXT:    vmovups %xmm0
-; FAST-AVX128-NEXT:    vmovups %xmm0
-; FAST-AVX128-NEXT:    vmovups %xmm0
-; FAST-AVX128-NOT:     vmovups
-;
-; FAST-AVX256-LABEL: store_zeros:
-; FAST-AVX256:       # %bb.0:
-; FAST-AVX256-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FAST-AVX256-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; FAST-AVX256-NEXT:    vmovups %ymm0
-; FAST-AVX256-NEXT:    vmovups %ymm0
-; FAST-AVX256-NOT:     vmovups
-;
-; FAST-AVX512-LABEL: store_zeros:
-; FAST-AVX512:       # %bb.0:
-; FAST-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FAST-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; FAST-AVX512-NEXT:    vmovups %zmm0, (%eax)
-; FAST-AVX512-NOT:     vmovups
+; FAST-AVX128-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FAST-AVX128-NEXT:    vmovups %xmm0, 16(%eax)
+; FAST-AVX128-NEXT:    vmovups %xmm0, (%eax)
+; FAST-AVX128-NEXT:    vmovups %xmm0, 48(%eax)
+; FAST-AVX128-NEXT:    vmovups %xmm0, 32(%eax)
+; FAST-AVX128-NEXT:    retl
   call void @llvm.memset.p0.i64(ptr %a, i8 0, i64 64, i1 false)
   ret void
 }
 
 declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1)
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FAST: {{.*}}
+; FAST-AVX256: {{.*}}
+; FAST-AVX512: {{.*}}
+; FAST-SSE: {{.*}}
+; SLOW: {{.*}}
+; SLOW-SSE: {{.*}}
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index c12a66247ca02..965675ae6dc81 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -157,27 +157,27 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 40(%ebp), %esi
 ; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    cmpl 24(%ebp), %ebx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    cmpl 24(%ebp), %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl 28(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sbbl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    cmovll 24(%ebp), %ebx
-; X86-NEXT:    cmovll 28(%ebp), %edi
-; X86-NEXT:    cmovll 32(%ebp), %edx
-; X86-NEXT:    cmovll %esi, %ecx
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovll %ebx, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    cmovll 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    cmovll 28(%ebp), %edi
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    cmovll 24(%ebp), %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -224,16 +224,14 @@ define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovgl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    cmovgl %esi, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    cmovgl %ecx, %edx
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %r = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %r
@@ -256,24 +254,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v3i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovgl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    cmovgl %ebx, %eax
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    cmovgl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmovgl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    cmovgl %esi, %ecx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %r = call <3 x i32> @llvm.smax.v3i32(<3 x i32> %a, <3 x i32> %b)
   ret <3 x i32> %r
@@ -296,31 +290,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovgl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovgl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovgl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovgl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -357,62 +347,47 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    cmovgl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    cmovgl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovgl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovgl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovgl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovgl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    cmovgl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 16(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %r
@@ -431,62 +406,47 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bp, %ax
-; X86-NEXT:    cmovgl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bx, %ax
-; X86-NEXT:    cmovgl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmovgl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    cmovgl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovgl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw %cx, %ax
 ; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovgl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovgl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %dx, 12(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 10(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 8(%ecx)
-; X86-NEXT:    movw %si, 6(%ecx)
-; X86-NEXT:    movw %di, 4(%ecx)
-; X86-NEXT:    movw %bx, 2(%ecx)
-; X86-NEXT:    movw %bp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %r
@@ -509,123 +469,87 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X86-LABEL: test_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovgl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovgl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb %cl, %al
 ; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovgl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovgl %eax, %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovgl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovgl %edx, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %r
@@ -645,11 +569,11 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $15, %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovgl %ecx, %eax
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $15, %eax
+; X86-NEXT:    cmpw %cx, %ax
+; X86-NEXT:    cmovlel %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %ax = ashr i16 %a, 15
@@ -670,9 +594,9 @@ define i32 @test_signbits_i32(i32 %a, i32 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $17, %eax
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmovgl %ecx, %eax
 ; X86-NEXT:    retl
@@ -725,19 +649,19 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    shrdl $28, %ecx, %edx
 ; X86-NEXT:    sarl $28, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    cmpl %esi, %edx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    cmovll %esi, %edx
 ; X86-NEXT:    cmovll %eax, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    cmovll %esi, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index cdfb484be0b6c..24c730300c37a 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -158,26 +158,27 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 44(%ebp), %edx
-; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    cmpl %ecx, 24(%ebp)
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %eax, 12(%edx)
+; X86-NEXT:    cmovll 32(%ebp), %ebx
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    cmovll 28(%ebp), %esi
+; X86-NEXT:    movl %esi, 4(%edx)
 ; X86-NEXT:    cmovll 24(%ebp), %ecx
-; X86-NEXT:    cmovll 28(%ebp), %edx
-; X86-NEXT:    cmovll 32(%ebp), %esi
-; X86-NEXT:    cmovll %edi, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -224,16 +225,14 @@ define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    cmovll %esi, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    cmovll %ecx, %edx
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %r = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %r
@@ -256,24 +255,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v3i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    cmovll %ebx, %eax
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    cmovll %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmovll %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    cmovll %esi, %ecx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %r = call <3 x i32> @llvm.smin.v3i32(<3 x i32> %a, <3 x i32> %b)
   ret <3 x i32> %r
@@ -296,31 +291,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovll %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovll %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -357,62 +348,47 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    cmovll %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovll %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovll %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 16(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %r
@@ -431,62 +407,47 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bp, %ax
-; X86-NEXT:    cmovll %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bx, %ax
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovll %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw %cx, %ax
 ; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovll %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %dx, 12(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 10(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 8(%ecx)
-; X86-NEXT:    movw %si, 6(%ecx)
-; X86-NEXT:    movw %di, 4(%ecx)
-; X86-NEXT:    movw %bx, 2(%ecx)
-; X86-NEXT:    movw %bp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %r
@@ -509,123 +470,87 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X86-LABEL: test_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovll %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovll %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb %cl, %al
 ; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovll %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovll %edx, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %r
@@ -645,11 +570,11 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $15, %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $15, %eax
+; X86-NEXT:    cmpw %cx, %ax
+; X86-NEXT:    cmovgel %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %ax = ashr i16 %a, 15
@@ -670,9 +595,9 @@ define i32 @test_signbits_i32(i32 %a, i32 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $17, %eax
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    retl
@@ -725,19 +650,19 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    shrdl $28, %ecx, %edx
 ; X86-NEXT:    sarl $28, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    cmovll %esi, %edx
 ; X86-NEXT:    cmovll %eax, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    cmovll %esi, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index df167338268c4..e33b734550b32 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,384 +191,395 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $108, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, 12(%eax)
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -593,7 +604,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %edi
@@ -626,67 +637,57 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    sarl $31, %ebp
 ; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    xorl %ebp, %edi
+; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    xorl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %ebp, %ebx
 ; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    orl %esi, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    xorl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl %ebp, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    xorl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    orl %edx, %ebp
 ; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:    orl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $1, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movb %bl, 16(%eax)
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movb %cl, 16(%eax)
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    orl %ebp, %ebx
 ; X86-NEXT:    setne 32(%eax)
-; X86-NEXT:    addl $108, %esp
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index 8cb032776114b..886c280835df6 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -117,10 +117,10 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $4, %al
 ; X86-NEXT:    sarb $4, %al
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shlb $4, %cl
-; X86-NEXT:    sarb $4, %cl
-; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movsbl %al, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    sarb $4, %al
 ; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    imull %ecx, %eax
 ; X86-NEXT:    shlb $6, %ah
@@ -160,38 +160,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shldl $30, %eax, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shldl $30, %eax, %ebp
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl %edx, 4(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $30, %eax, %edx
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
   ret <4 x i32> %tmp
@@ -281,23 +267,19 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
   ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index e68b6e328b723..04320695b2ebd 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -29,11 +29,11 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shrdl $2, %edx, %eax
-; X86-NEXT:    cmpl $2, %edx
 ; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2, %edx
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmpl $-2, %edx
 ; X86-NEXT:    movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    cmpl $-2, %edx
 ; X86-NEXT:    cmovll %ecx, %eax
 ; X86-NEXT:    retl
   %tmp = call i32 @llvm.smul.fix.sat.i32(i32 %x, i32 %y, i32 2)
@@ -166,22 +166,22 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    movsbl %al, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shlb $4, %cl
+; X86-NEXT:    sarb $4, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    imull %ecx, %eax
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shlb $6, %cl
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    cmpb $2, %ah
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    shlb $6, %al
+; X86-NEXT:    shrb $2, %cl
+; X86-NEXT:    orb %al, %cl
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    cmovll %ecx, %edx
-; X86-NEXT:    cmpb $-2, %ah
+; X86-NEXT:    cmpb $2, %ch
+; X86-NEXT:    cmovll %eax, %edx
 ; X86-NEXT:    movl $128, %eax
+; X86-NEXT:    cmpb $-2, %ch
 ; X86-NEXT:    cmovgel %edx, %eax
 ; X86-NEXT:    sarb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -261,55 +261,46 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shrdl $2, %edx, %ecx
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    movl $2147483647, %ebp # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovgel %ebp, %ecx
+; X86-NEXT:    cmovgel %esi, %eax
+; X86-NEXT:    movl $-2147483648, %edi # imm = 0x80000000
 ; X86-NEXT:    cmpl $-2, %edx
-; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
-; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    shrdl $2, %edx, %edi
+; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    cmovgel %ebp, %edi
+; X86-NEXT:    cmovgel %esi, %eax
 ; X86-NEXT:    cmpl $-2, %edx
-; X86-NEXT:    cmovll %esi, %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl %eax, 8(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shrdl $2, %edx, %ebx
+; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    cmovgel %ebp, %ebx
+; X86-NEXT:    cmovgel %esi, %eax
 ; X86-NEXT:    cmpl $-2, %edx
-; X86-NEXT:    cmovll %esi, %ebx
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    cmovgel %ebp, %eax
+; X86-NEXT:    cmovgel %esi, %eax
 ; X86-NEXT:    cmpl $-2, %edx
-; X86-NEXT:    cmovll %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %eax, 12(%edx)
-; X86-NEXT:    movl %ebx, 8(%edx)
-; X86-NEXT:    movl %edi, 4(%edx)
-; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll %edi, %eax
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
   ret <4 x i32> %tmp
@@ -375,22 +366,22 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %esi, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl %edx, %esi
@@ -462,19 +453,19 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ;
 ; X86-LABEL: func6:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shlb $4, %cl
-; X86-NEXT:    sarb $4, %cl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shlb $4, %dl
+; X86-NEXT:    sarb $4, %dl
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    xorb %cl, %ah
-; X86-NEXT:    sets %dl
-; X86-NEXT:    addl $127, %edx
-; X86-NEXT:    imulb %cl
+; X86-NEXT:    xorb %dl, %ah
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    imulb %dl
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    cmovol %edx, %eax
+; X86-NEXT:    cmovol %ecx, %eax
 ; X86-NEXT:    sarb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
@@ -539,55 +530,51 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    imull %edx, %ecx
 ; X86-NEXT:    cmovol %eax, %ecx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %ebx, %edx
-; X86-NEXT:    cmovol %eax, %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    cmovol %eax, %edi
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    cmovol %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    cmovol %ecx, %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    cmovol %ecx, %esi
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    cmovol %ecx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
   ret <4 x i32> %tmp
@@ -739,8 +726,8 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    cmovnsl %edx, %esi
 ; X86-NEXT:    shrdl $31, %esi, %eax
 ; X86-NEXT:    shrdl $31, %ecx, %esi
-; X86-NEXT:    cmpl $1073741824, %ecx # imm = 0x40000000
 ; X86-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $1073741824, %ecx # imm = 0x40000000
 ; X86-NEXT:    cmovll %esi, %edi
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovgel %edx, %eax
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 13596e1b18768..47edc149698de 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -76,51 +76,15 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -129,154 +93,178 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    xorl %ebx, %eax
 ; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    xorl %ebx, %edx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    popl %esi
@@ -523,103 +511,102 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $128, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 148
+; X86-NEXT:    subl $100, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 120
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
@@ -627,307 +614,306 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -937,14 +923,14 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -955,176 +941,181 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sarl $31, %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    adcl %edx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
@@ -1153,134 +1144,116 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    imull %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    xorl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    orl %esi, %ebp
+; X86-NEXT:    orl %ecx, %ebp
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %ebp, %eax
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $128, %esp
+; X86-NEXT:    addl $100, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/split-vector-bitcast.ll b/llvm/test/CodeGen/X86/split-vector-bitcast.ll
index 2039a0274496c..8f370669278d1 100644
--- a/llvm/test/CodeGen/X86/split-vector-bitcast.ll
+++ b/llvm/test/CodeGen/X86/split-vector-bitcast.ll
@@ -8,20 +8,20 @@ define void @a(ptr %a, ptr %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; CHECK-NEXT:    addps %xmm0, %xmm0
 ; CHECK-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %eax, (%ecx)
 ; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    movss %xmm0, (%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl (%esp), %edx
-; CHECK-NEXT:    addl %edx, %edx
-; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    movl %ecx, (%eax)
-; CHECK-NEXT:    movl %edx, 4(%eax)
+; CHECK-NEXT:    movl (%esp), %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    movl %eax, 4(%ecx)
 ; CHECK-NEXT:    addl $8, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index 650b562e6cb5c..07be72bd9af54 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -115,49 +115,51 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl $1, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl $1, %ebp
-; X86-NEXT:    negl %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    negl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    pushl $-1
 ; X86-NEXT:    pushl $-9
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $9
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    notl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    andl $1, %ebp
+; X86-NEXT:    negl %ebp
+; X86-NEXT:    xorl $3, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    setne %bl
+; X86-NEXT:    xorl $3, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    setne %bh
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    pushl $9
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    xorl $3, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    xorl $3, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    setne %cl
-; X86-NEXT:    xorl $-3, %ebx
-; X86-NEXT:    andl $1, %ebp
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    xorl $-3, %eax
+; X86-NEXT:    notl %edx
+; X86-NEXT:    andl $1, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne %dl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movb %bh, %cl
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll
index 2b980683cba75..8dd232eed6c61 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll
@@ -47,10 +47,10 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
 define i32 @test_optsize(i32 %X) optsize nounwind readnone {
 ; X86-LABEL: test_optsize:
 ; X86:       # %bb.0:
-; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
-; X86-NEXT:    addl $429496729, %eax # imm = 0x19999999
-; X86-NEXT:    cmpl $858993459, %eax # imm = 0x33333333
+; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD
+; X86-NEXT:    addl $429496729, %ecx # imm = 0x19999999
 ; X86-NEXT:    movl $42, %eax
+; X86-NEXT:    cmpl $858993459, %ecx # imm = 0x33333333
 ; X86-NEXT:    jb .LBB1_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl $-10, %eax
diff --git a/llvm/test/CodeGen/X86/sse-fcopysign.ll b/llvm/test/CodeGen/X86/sse-fcopysign.ll
index 3eadcad145b65..9f3f6ab1472fa 100644
--- a/llvm/test/CodeGen/X86/sse-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/sse-fcopysign.ll
@@ -11,8 +11,8 @@ define float @tst1(float %a, float %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    movss %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    movss %xmm0, (%esp)
 ; X86-NEXT:    calll copysignf
 ; X86-NEXT:    addl $8, %esp
@@ -33,11 +33,11 @@ define double @tst2(double %a, float %b, float %c) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm1
-; X86-NEXT:    cvtss2sd %xmm1, %xmm1
 ; X86-NEXT:    movsd %xmm0, (%esp)
-; X86-NEXT:    movsd %xmm1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    cvtss2sd %xmm0, %xmm0
+; X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll copysign
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    retl
@@ -58,8 +58,8 @@ define x86_fp80 @tst3(x86_fp80 %a, x86_fp80 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    calll copysignl
 ; X86-NEXT:    addl $24, %esp
@@ -118,14 +118,14 @@ define double @int2(double %a, float %b, float %c) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    addss 20(%ebp), %xmm0
-; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    cvtss2sd %xmm0, %xmm0
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT:    orps %xmm1, %xmm0
-; X86-NEXT:    movlps %xmm0, (%esp)
+; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    addss 20(%ebp), %xmm1
+; X86-NEXT:    cvtss2sd %xmm1, %xmm1
+; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-NEXT:    orps %xmm0, %xmm1
+; X86-NEXT:    movlps %xmm1, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -150,8 +150,8 @@ define x86_fp80 @int3(x86_fp80 %a, x86_fp80 %b) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
-; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fabs
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fchs
@@ -237,9 +237,9 @@ define void @PR41749() {
 ; X86-NEXT:    fldz
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fstpt (%esp)
-; X86-NEXT:    testb $-128, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fld %st(0)
 ; X86-NEXT:    fchs
+; X86-NEXT:    testb $-128, {{[0-9]+}}(%esp)
 ; X86-NEXT:    fxch %st(1)
 ; X86-NEXT:    fcmovne %st(1), %st
 ; X86-NEXT:    fstp %st(1)
diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll
index b2de7545ff5f5..105165bbddb61 100644
--- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll
@@ -14,16 +14,16 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $80, %esp
+; WIN32-NEXT:    movl %esp, %eax
+; WIN32-NEXT:    movups 24(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm0
+; WIN32-NEXT:    movups 40(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm1
+; WIN32-NEXT:    movups 56(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm2
 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
 ; WIN32-NEXT:    addps %xmm4, %xmm3
-; WIN32-NEXT:    movups 56(%ebp), %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm5
-; WIN32-NEXT:    movups 24(%ebp), %xmm6
-; WIN32-NEXT:    movl %esp, %eax
-; WIN32-NEXT:    addps %xmm6, %xmm0
-; WIN32-NEXT:    addps %xmm5, %xmm1
-; WIN32-NEXT:    addps %xmm4, %xmm2
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
@@ -92,30 +92,30 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $80, %esp
-; WIN32-NEXT:    movups 72(%ebp), %xmm6
-; WIN32-NEXT:    movups 8(%ebp), %xmm3
-; WIN32-NEXT:    movups 56(%ebp), %xmm7
-; WIN32-NEXT:    movups 40(%ebp), %xmm5
-; WIN32-NEXT:    movups 24(%ebp), %xmm4
 ; WIN32-NEXT:    movl %esp, %eax
-; WIN32-NEXT:    addps %xmm4, %xmm0
-; WIN32-NEXT:    addps %xmm5, %xmm1
-; WIN32-NEXT:    addps %xmm7, %xmm2
-; WIN32-NEXT:    addps %xmm6, %xmm3
+; WIN32-NEXT:    movups 24(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm0
+; WIN32-NEXT:    movups 40(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm1
+; WIN32-NEXT:    movups 56(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm2
+; WIN32-NEXT:    movups 72(%ebp), %xmm4
+; WIN32-NEXT:    movups 8(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm4, %xmm3
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
-; WIN32-NEXT:    movups 72(%ebp), %xmm4
-; WIN32-NEXT:    addps %xmm4, %xmm3
-; WIN32-NEXT:    movups 56(%ebp), %xmm4
-; WIN32-NEXT:    addps %xmm4, %xmm2
-; WIN32-NEXT:    movups 40(%ebp), %xmm4
-; WIN32-NEXT:    addps %xmm4, %xmm1
 ; WIN32-NEXT:    movups 24(%ebp), %xmm4
 ; WIN32-NEXT:    addps %xmm4, %xmm0
 ; WIN32-NEXT:    addps (%esp), %xmm0
+; WIN32-NEXT:    movups 40(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm1
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
+; WIN32-NEXT:    movups 56(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm2
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
+; WIN32-NEXT:    movups 72(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm3
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
@@ -201,13 +201,13 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $96, %esp
+; WIN32-NEXT:    movups 56(%ebp), %xmm4
+; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups 40(%ebp), %xmm4
+; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups 24(%ebp), %xmm4
+; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups 8(%ebp), %xmm4
-; WIN32-NEXT:    movups 24(%ebp), %xmm5
-; WIN32-NEXT:    movups 40(%ebp), %xmm6
-; WIN32-NEXT:    movups 56(%ebp), %xmm7
-; WIN32-NEXT:    movups %xmm7, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movups %xmm6, {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movups %xmm5, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm3, (%esp)
 ; WIN32-NEXT:    calll _func_float16
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 2e2e78a6da51e..9b0b684a1ed95 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -3130,43 +3130,81 @@ define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
 }
 
 define void @test_MM_TRANSPOSE4_PS(ptr %a0, ptr %a1, ptr %a2, ptr %a3) nounwind {
-; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-SSE-NEXT:    movaps (%esi), %xmm1 # encoding: [0x0f,0x28,0x0e]
-; X86-SSE-NEXT:    movaps (%edx), %xmm2 # encoding: [0x0f,0x28,0x12]
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0 # encoding: [0x0f,0x28,0x01]
-; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
-; X86-SSE-NEXT:    movaps %xmm1, %xmm4 # encoding: [0x0f,0x28,0xe1]
-; X86-SSE-NEXT:    unpcklps %xmm2, %xmm4 # encoding: [0x0f,0x14,0xe2]
-; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE-NEXT:    movaps %xmm0, %xmm5 # encoding: [0x0f,0x28,0xe8]
-; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
-; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; X86-SSE-NEXT:    unpckhps %xmm2, %xmm1 # encoding: [0x0f,0x15,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    unpckhps %xmm3, %xmm0 # encoding: [0x0f,0x15,0xc3]
-; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; X86-SSE-NEXT:    movaps %xmm4, %xmm2 # encoding: [0x0f,0x28,0xd4]
-; X86-SSE-NEXT:    movlhps %xmm5, %xmm2 # encoding: [0x0f,0x16,0xd5]
-; X86-SSE-NEXT:    # xmm2 = xmm2[0],xmm5[0]
-; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
-; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
-; X86-SSE-NEXT:    movaps %xmm1, %xmm3 # encoding: [0x0f,0x28,0xd9]
-; X86-SSE-NEXT:    movlhps %xmm0, %xmm3 # encoding: [0x0f,0x16,0xd8]
-; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0]
-; X86-SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
-; X86-SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movaps %xmm2, (%esi) # encoding: [0x0f,0x29,0x16]
-; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
-; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
-; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
-; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
-; X86-SSE-NEXT:    retl # encoding: [0xc3]
+; X86-SSE1-LABEL: test_MM_TRANSPOSE4_PS:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %esi # encoding: [0x56]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
+; X86-SSE1-NEXT:    movaps (%edx), %xmm2 # encoding: [0x0f,0x28,0x12]
+; X86-SSE1-NEXT:    movaps (%esi), %xmm1 # encoding: [0x0f,0x28,0x0e]
+; X86-SSE1-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X86-SSE1-NEXT:    unpckhps %xmm2, %xmm0 # encoding: [0x0f,0x15,0xc2]
+; X86-SSE1-NEXT:    # xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE1-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
+; X86-SSE1-NEXT:    movaps (%ecx), %xmm4 # encoding: [0x0f,0x28,0x21]
+; X86-SSE1-NEXT:    movaps %xmm4, %xmm5 # encoding: [0x0f,0x28,0xec]
+; X86-SSE1-NEXT:    unpckhps %xmm3, %xmm5 # encoding: [0x0f,0x15,0xeb]
+; X86-SSE1-NEXT:    # xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; X86-SSE1-NEXT:    movaps %xmm5, %xmm6 # encoding: [0x0f,0x28,0xf5]
+; X86-SSE1-NEXT:    movhlps %xmm0, %xmm6 # encoding: [0x0f,0x12,0xf0]
+; X86-SSE1-NEXT:    # xmm6 = xmm0[1],xmm6[1]
+; X86-SSE1-NEXT:    movlhps %xmm5, %xmm0 # encoding: [0x0f,0x16,0xc5]
+; X86-SSE1-NEXT:    # xmm0 = xmm0[0],xmm5[0]
+; X86-SSE1-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
+; X86-SSE1-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE1-NEXT:    unpcklps %xmm3, %xmm4 # encoding: [0x0f,0x14,0xe3]
+; X86-SSE1-NEXT:    # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE1-NEXT:    movaps %xmm4, %xmm2 # encoding: [0x0f,0x28,0xd4]
+; X86-SSE1-NEXT:    movhlps %xmm1, %xmm2 # encoding: [0x0f,0x12,0xd1]
+; X86-SSE1-NEXT:    # xmm2 = xmm1[1],xmm2[1]
+; X86-SSE1-NEXT:    movlhps %xmm4, %xmm1 # encoding: [0x0f,0x16,0xcc]
+; X86-SSE1-NEXT:    # xmm1 = xmm1[0],xmm4[0]
+; X86-SSE1-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
+; X86-SSE1-NEXT:    movaps %xmm2, (%edx) # encoding: [0x0f,0x29,0x12]
+; X86-SSE1-NEXT:    movaps %xmm0, (%ecx) # encoding: [0x0f,0x29,0x01]
+; X86-SSE1-NEXT:    movaps %xmm6, (%eax) # encoding: [0x0f,0x29,0x30]
+; X86-SSE1-NEXT:    popl %esi # encoding: [0x5e]
+; X86-SSE1-NEXT:    retl # encoding: [0xc3]
+;
+; X86-SSE2-LABEL: test_MM_TRANSPOSE4_PS:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %esi # encoding: [0x56]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
+; X86-SSE2-NEXT:    movaps (%eax), %xmm1 # encoding: [0x0f,0x28,0x08]
+; X86-SSE2-NEXT:    movaps (%ecx), %xmm0 # encoding: [0x0f,0x28,0x01]
+; X86-SSE2-NEXT:    movaps %xmm0, %xmm2 # encoding: [0x0f,0x28,0xd0]
+; X86-SSE2-NEXT:    unpckhps %xmm1, %xmm2 # encoding: [0x0f,0x15,0xd1]
+; X86-SSE2-NEXT:    # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE2-NEXT:    movaps (%edx), %xmm3 # encoding: [0x0f,0x28,0x1a]
+; X86-SSE2-NEXT:    movaps (%esi), %xmm4 # encoding: [0x0f,0x28,0x26]
+; X86-SSE2-NEXT:    movaps %xmm4, %xmm5 # encoding: [0x0f,0x28,0xec]
+; X86-SSE2-NEXT:    unpckhps %xmm3, %xmm5 # encoding: [0x0f,0x15,0xeb]
+; X86-SSE2-NEXT:    # xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; X86-SSE2-NEXT:    movaps %xmm5, %xmm6 # encoding: [0x0f,0x28,0xf5]
+; X86-SSE2-NEXT:    unpckhpd %xmm2, %xmm6 # encoding: [0x66,0x0f,0x15,0xf2]
+; X86-SSE2-NEXT:    # xmm6 = xmm6[1],xmm2[1]
+; X86-SSE2-NEXT:    movlhps %xmm2, %xmm5 # encoding: [0x0f,0x16,0xea]
+; X86-SSE2-NEXT:    # xmm5 = xmm5[0],xmm2[0]
+; X86-SSE2-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
+; X86-SSE2-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT:    unpcklps %xmm3, %xmm4 # encoding: [0x0f,0x14,0xe3]
+; X86-SSE2-NEXT:    # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-SSE2-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
+; X86-SSE2-NEXT:    unpckhpd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x15,0xc8]
+; X86-SSE2-NEXT:    # xmm1 = xmm1[1],xmm0[1]
+; X86-SSE2-NEXT:    movlhps %xmm0, %xmm4 # encoding: [0x0f,0x16,0xe0]
+; X86-SSE2-NEXT:    # xmm4 = xmm4[0],xmm0[0]
+; X86-SSE2-NEXT:    movaps %xmm4, (%esi) # encoding: [0x0f,0x29,0x26]
+; X86-SSE2-NEXT:    movaps %xmm1, (%edx) # encoding: [0x0f,0x29,0x0a]
+; X86-SSE2-NEXT:    movaps %xmm5, (%ecx) # encoding: [0x0f,0x29,0x29]
+; X86-SSE2-NEXT:    movaps %xmm6, (%eax) # encoding: [0x0f,0x29,0x30]
+; X86-SSE2-NEXT:    popl %esi # encoding: [0x5e]
+; X86-SSE2-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
 ; X86-AVX1:       # %bb.0:
@@ -3175,30 +3213,30 @@ define void @test_MM_TRANSPOSE4_PS(ptr %a0, ptr %a1, ptr %a2, ptr %a3) nounwind
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
-; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
-; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
-; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
-; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
-; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
-; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
-; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
-; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
-; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
-; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
-; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
-; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
-; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
-; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
-; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
-; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
+; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
+; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; X86-AVX1-NEXT:    vunpckhps %xmm0, %xmm1, %xmm2 # encoding: [0xc5,0xf0,0x15,0xd0]
+; X86-AVX1-NEXT:    # xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-AVX1-NEXT:    vmovaps (%edx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x1a]
+; X86-AVX1-NEXT:    vmovaps (%esi), %xmm4 # encoding: [0xc5,0xf8,0x28,0x26]
+; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm4, %xmm5 # encoding: [0xc5,0xd8,0x15,0xeb]
+; X86-AVX1-NEXT:    # xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X86-AVX1-NEXT:    vunpckhpd %xmm2, %xmm5, %xmm6 # encoding: [0xc5,0xd1,0x15,0xf2]
+; X86-AVX1-NEXT:    # xmm6 = xmm5[1],xmm2[1]
+; X86-AVX1-NEXT:    vmovlhps %xmm2, %xmm5, %xmm2 # encoding: [0xc5,0xd0,0x16,0xd2]
+; X86-AVX1-NEXT:    # xmm2 = xmm5[0],xmm2[0]
+; X86-AVX1-NEXT:    vunpcklps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0x14,0xc0]
+; X86-AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm4, %xmm1 # encoding: [0xc5,0xd8,0x14,0xcb]
+; X86-AVX1-NEXT:    # xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm3 # encoding: [0xc5,0xf1,0x15,0xd8]
+; X86-AVX1-NEXT:    # xmm3 = xmm1[1],xmm0[1]
+; X86-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0x16,0xc0]
+; X86-AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X86-AVX1-NEXT:    vmovaps %xmm0, (%esi) # encoding: [0xc5,0xf8,0x29,0x06]
 ; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
-; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
-; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
+; X86-AVX1-NEXT:    vmovaps %xmm2, (%ecx) # encoding: [0xc5,0xf8,0x29,0x11]
+; X86-AVX1-NEXT:    vmovaps %xmm6, (%eax) # encoding: [0xc5,0xf8,0x29,0x30]
 ; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3209,30 +3247,30 @@ define void @test_MM_TRANSPOSE4_PS(ptr %a0, ptr %a1, ptr %a2, ptr %a3) nounwind
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
-; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
-; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
-; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
-; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
-; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
-; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
-; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
-; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
-; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
-; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
-; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
-; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
-; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
-; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
-; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
+; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
+; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x09]
+; X86-AVX512-NEXT:    vunpckhps %xmm0, %xmm1, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x15,0xd0]
+; X86-AVX512-NEXT:    # xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-AVX512-NEXT:    vmovaps (%edx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x1a]
+; X86-AVX512-NEXT:    vmovaps (%esi), %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x26]
+; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm4, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x15,0xeb]
+; X86-AVX512-NEXT:    # xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X86-AVX512-NEXT:    vunpckhpd %xmm2, %xmm5, %xmm6 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0x15,0xf2]
+; X86-AVX512-NEXT:    # xmm6 = xmm5[1],xmm2[1]
+; X86-AVX512-NEXT:    vmovlhps %xmm2, %xmm5, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd0,0x16,0xd2]
+; X86-AVX512-NEXT:    # xmm2 = xmm5[0],xmm2[0]
+; X86-AVX512-NEXT:    vunpcklps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x14,0xc0]
+; X86-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x14,0xcb]
+; X86-AVX512-NEXT:    # xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X86-AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xd8]
+; X86-AVX512-NEXT:    # xmm3 = xmm1[1],xmm0[1]
+; X86-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
+; X86-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X86-AVX512-NEXT:    vmovaps %xmm0, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x06]
 ; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
-; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
-; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
+; X86-AVX512-NEXT:    vmovaps %xmm2, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x11]
+; X86-AVX512-NEXT:    vmovaps %xmm6, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x30]
 ; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/sse-only.ll b/llvm/test/CodeGen/X86/sse-only.ll
index 3814865868d08..5d480d66ad471 100644
--- a/llvm/test/CodeGen/X86/sse-only.ll
+++ b/llvm/test/CodeGen/X86/sse-only.ll
@@ -7,9 +7,9 @@ define void @test1(ptr %r, ptr %A, double %B) nounwind  {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movaps (%ecx), %xmm0
+; CHECK-NEXT:    movaps (%eax), %xmm0
 ; CHECK-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movaps %xmm0, (%eax)
 ; CHECK-NEXT:    retl
 	%tmp3 = load <2 x double>, ptr %A, align 16
diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 03b9e123eea48..38c3e6e316994 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -80,34 +80,33 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b,
 ; WIN32-NEXT:    movaps %xmm6, %xmm7
 ; WIN32-NEXT:    movaps %xmm5, %xmm6
 ; WIN32-NEXT:    movaps %xmm4, %xmm5
-; WIN32-NEXT:    movaps %xmm1, %xmm4
-; WIN32-NEXT:    movaps %xmm0, %xmm1
+; WIN32-NEXT:    movaps %xmm3, %xmm4
+; WIN32-NEXT:    movaps %xmm0, %xmm3
+; WIN32-NEXT:    mulps %xmm5, %xmm3
 ; WIN32-NEXT:    addps %xmm5, %xmm0
-; WIN32-NEXT:    mulps %xmm5, %xmm1
-; WIN32-NEXT:    subps %xmm1, %xmm0
-; WIN32-NEXT:    movups 8(%ebp), %xmm1
-; WIN32-NEXT:    addps %xmm1, %xmm0
-; WIN32-NEXT:    movaps %xmm4, %xmm1
+; WIN32-NEXT:    subps %xmm3, %xmm0
+; WIN32-NEXT:    movups 8(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm0
+; WIN32-NEXT:    movaps %xmm1, %xmm3
+; WIN32-NEXT:    mulps %xmm6, %xmm3
 ; WIN32-NEXT:    addps %xmm6, %xmm1
-; WIN32-NEXT:    mulps %xmm6, %xmm4
-; WIN32-NEXT:    subps %xmm4, %xmm1
-; WIN32-NEXT:    movups 24(%ebp), %xmm4
-; WIN32-NEXT:    addps %xmm4, %xmm1
-; WIN32-NEXT:    movaps %xmm2, %xmm4
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    mulps %xmm7, %xmm2
-; WIN32-NEXT:    subps %xmm2, %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm2
-; WIN32-NEXT:    addps %xmm2, %xmm4
-; WIN32-NEXT:    movaps %xmm3, %xmm5
-; WIN32-NEXT:    movaps (%esp), %xmm2 # 16-byte Reload
-; WIN32-NEXT:    addps %xmm2, %xmm5
-; WIN32-NEXT:    mulps %xmm2, %xmm3
-; WIN32-NEXT:    subps %xmm3, %xmm5
-; WIN32-NEXT:    movups 56(%ebp), %xmm2
-; WIN32-NEXT:    addps %xmm2, %xmm5
-; WIN32-NEXT:    movaps %xmm4, %xmm2
-; WIN32-NEXT:    movaps %xmm5, %xmm3
+; WIN32-NEXT:    subps %xmm3, %xmm1
+; WIN32-NEXT:    movups 24(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm1
+; WIN32-NEXT:    movaps %xmm2, %xmm3
+; WIN32-NEXT:    mulps %xmm7, %xmm3
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    subps %xmm3, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm2
+; WIN32-NEXT:    movaps %xmm4, %xmm3
+; WIN32-NEXT:    movaps (%esp), %xmm5 # 16-byte Reload
+; WIN32-NEXT:    mulps %xmm5, %xmm3
+; WIN32-NEXT:    addps %xmm5, %xmm4
+; WIN32-NEXT:    subps %xmm3, %xmm4
+; WIN32-NEXT:    movups 56(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm4
+; WIN32-NEXT:    movaps %xmm4, %xmm3
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -198,46 +197,46 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    subl $12, %esp
+; WIN32-NEXT:    subl $20, %esp
 ; WIN32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    leal (%edx,%edi), %eax
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %eax
-; WIN32-NEXT:    subl %edi, %eax
-; WIN32-NEXT:    movl %ebp, %edx
-; WIN32-NEXT:    subl %ecx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    imull %edx, %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull %eax, %edx
-; WIN32-NEXT:    addl %ebx, %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; WIN32-NEXT:    subl %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %edi, %eax
-; WIN32-NEXT:    addl %edx, %eax
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; WIN32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebx, %esi
+; WIN32-NEXT:    leal (%edx,%edi), %ebp
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    imull %edx, %ecx
+; WIN32-NEXT:    addl %ebp, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull %edx, %ebp
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT:    addl %esi, %ebp
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    imull %ebx, %ecx
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    addl %eax, %ebp
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    addl %ebp, %edx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ecx, %edx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    imull %esi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    addl %edx, %eax
+; WIN32-NEXT:    addl $20, %esp
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -373,42 +372,42 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %
 ; WIN32-NEXT:    movaps %xmm2, %xmm3
 ; WIN32-NEXT:    movaps %xmm1, %xmm2
 ; WIN32-NEXT:    movaps %xmm0, %xmm1
-; WIN32-NEXT:    movups 120(%ebp), %xmm7
-; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; WIN32-NEXT:    addps %xmm7, %xmm0
-; WIN32-NEXT:    movups 248(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm0
-; WIN32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; WIN32-NEXT:    movups 8(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 136(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 24(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 152(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 168(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 184(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 72(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 200(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 88(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
+; WIN32-NEXT:    movups 216(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
 ; WIN32-NEXT:    movups 104(%ebp), %xmm7
 ; WIN32-NEXT:    movaps (%esp), %xmm0 # 16-byte Reload
 ; WIN32-NEXT:    addps %xmm7, %xmm0
 ; WIN32-NEXT:    movups 232(%ebp), %xmm7
 ; WIN32-NEXT:    addps %xmm7, %xmm0
 ; WIN32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
-; WIN32-NEXT:    movups 88(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm6
-; WIN32-NEXT:    movups 216(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm6
-; WIN32-NEXT:    movups 72(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm5
-; WIN32-NEXT:    movups 200(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm5
-; WIN32-NEXT:    movups 56(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    movups 184(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm3
-; WIN32-NEXT:    movups 168(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm3
-; WIN32-NEXT:    movups 24(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm2
-; WIN32-NEXT:    movups 152(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm2
-; WIN32-NEXT:    movups 8(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm1
-; WIN32-NEXT:    movups 136(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 120(%ebp), %xmm7
+; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movups 248(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movaps %xmm0, %xmm7
 ; WIN32-NEXT:    movaps %xmm1, %xmm0
 ; WIN32-NEXT:    movaps %xmm2, %xmm1
 ; WIN32-NEXT:    movaps %xmm3, %xmm2
@@ -416,7 +415,6 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %
 ; WIN32-NEXT:    movaps %xmm5, %xmm4
 ; WIN32-NEXT:    movaps %xmm6, %xmm5
 ; WIN32-NEXT:    movaps (%esp), %xmm6 # 16-byte Reload
-; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll
index 6f964f0a88ea3..5307c60f94214 100644
--- a/llvm/test/CodeGen/X86/sse-regcall4.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -80,34 +80,33 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b,
 ; WIN32-NEXT:    movaps %xmm6, %xmm7
 ; WIN32-NEXT:    movaps %xmm5, %xmm6
 ; WIN32-NEXT:    movaps %xmm4, %xmm5
-; WIN32-NEXT:    movaps %xmm1, %xmm4
-; WIN32-NEXT:    movaps %xmm0, %xmm1
+; WIN32-NEXT:    movaps %xmm3, %xmm4
+; WIN32-NEXT:    movaps %xmm0, %xmm3
+; WIN32-NEXT:    mulps %xmm5, %xmm3
 ; WIN32-NEXT:    addps %xmm5, %xmm0
-; WIN32-NEXT:    mulps %xmm5, %xmm1
-; WIN32-NEXT:    subps %xmm1, %xmm0
-; WIN32-NEXT:    movups 8(%ebp), %xmm1
-; WIN32-NEXT:    addps %xmm1, %xmm0
-; WIN32-NEXT:    movaps %xmm4, %xmm1
+; WIN32-NEXT:    subps %xmm3, %xmm0
+; WIN32-NEXT:    movups 8(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm0
+; WIN32-NEXT:    movaps %xmm1, %xmm3
+; WIN32-NEXT:    mulps %xmm6, %xmm3
 ; WIN32-NEXT:    addps %xmm6, %xmm1
-; WIN32-NEXT:    mulps %xmm6, %xmm4
-; WIN32-NEXT:    subps %xmm4, %xmm1
-; WIN32-NEXT:    movups 24(%ebp), %xmm4
-; WIN32-NEXT:    addps %xmm4, %xmm1
-; WIN32-NEXT:    movaps %xmm2, %xmm4
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    mulps %xmm7, %xmm2
-; WIN32-NEXT:    subps %xmm2, %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm2
-; WIN32-NEXT:    addps %xmm2, %xmm4
-; WIN32-NEXT:    movaps %xmm3, %xmm5
-; WIN32-NEXT:    movaps (%esp), %xmm2 # 16-byte Reload
-; WIN32-NEXT:    addps %xmm2, %xmm5
-; WIN32-NEXT:    mulps %xmm2, %xmm3
-; WIN32-NEXT:    subps %xmm3, %xmm5
-; WIN32-NEXT:    movups 56(%ebp), %xmm2
-; WIN32-NEXT:    addps %xmm2, %xmm5
-; WIN32-NEXT:    movaps %xmm4, %xmm2
-; WIN32-NEXT:    movaps %xmm5, %xmm3
+; WIN32-NEXT:    subps %xmm3, %xmm1
+; WIN32-NEXT:    movups 24(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm1
+; WIN32-NEXT:    movaps %xmm2, %xmm3
+; WIN32-NEXT:    mulps %xmm7, %xmm3
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    subps %xmm3, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm2
+; WIN32-NEXT:    movaps %xmm4, %xmm3
+; WIN32-NEXT:    movaps (%esp), %xmm5 # 16-byte Reload
+; WIN32-NEXT:    mulps %xmm5, %xmm3
+; WIN32-NEXT:    addps %xmm5, %xmm4
+; WIN32-NEXT:    subps %xmm3, %xmm4
+; WIN32-NEXT:    movups 56(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm3, %xmm4
+; WIN32-NEXT:    movaps %xmm4, %xmm3
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -198,46 +197,45 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    subl $8, %esp
-; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    subl $16, %esp
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    leal (%edi,%ebp), %ebx
+; WIN32-NEXT:    imull %eax, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    leal (%eax,%esi), %ecx
-; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    subl %esi, %ebx
-; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    subl %edx, %eax
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    imull %eax, %ebp
+; WIN32-NEXT:    leal (%ebp,%eax), %edi
+; WIN32-NEXT:    leal (%ecx,%edx), %eax
+; WIN32-NEXT:    imull %edi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    imull (%esp), %ebx # 4-byte Folded Reload
+; WIN32-NEXT:    addl %eax, %ebx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    addl %ebp, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl %ebp, %ebx
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %ebx, %eax
-; WIN32-NEXT:    addl %esi, %eax
-; WIN32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    imull %esi, %edi
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; WIN32-NEXT:    addl %edx, %edi
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; WIN32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    imull %eax, %esi
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; WIN32-NEXT:    imull %ebp, %ecx
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    addl %eax, %edi
-; WIN32-NEXT:    movl %edi, %ecx
-; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull %edi, %edx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -372,42 +370,42 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %
 ; WIN32-NEXT:    movaps %xmm2, %xmm3
 ; WIN32-NEXT:    movaps %xmm1, %xmm2
 ; WIN32-NEXT:    movaps %xmm0, %xmm1
-; WIN32-NEXT:    movups 120(%ebp), %xmm7
-; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; WIN32-NEXT:    addps %xmm7, %xmm0
-; WIN32-NEXT:    movups 248(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm0
-; WIN32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; WIN32-NEXT:    movups 8(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 136(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 24(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 152(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 168(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 184(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 72(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 200(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 88(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
+; WIN32-NEXT:    movups 216(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
 ; WIN32-NEXT:    movups 104(%ebp), %xmm7
 ; WIN32-NEXT:    movaps (%esp), %xmm0 # 16-byte Reload
 ; WIN32-NEXT:    addps %xmm7, %xmm0
 ; WIN32-NEXT:    movups 232(%ebp), %xmm7
 ; WIN32-NEXT:    addps %xmm7, %xmm0
 ; WIN32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
-; WIN32-NEXT:    movups 88(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm6
-; WIN32-NEXT:    movups 216(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm6
-; WIN32-NEXT:    movups 72(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm5
-; WIN32-NEXT:    movups 200(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm5
-; WIN32-NEXT:    movups 56(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    movups 184(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm3
-; WIN32-NEXT:    movups 168(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm3
-; WIN32-NEXT:    movups 24(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm2
-; WIN32-NEXT:    movups 152(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm2
-; WIN32-NEXT:    movups 8(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm1
-; WIN32-NEXT:    movups 136(%ebp), %xmm7
-; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 120(%ebp), %xmm7
+; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movups 248(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movaps %xmm0, %xmm7
 ; WIN32-NEXT:    movaps %xmm1, %xmm0
 ; WIN32-NEXT:    movaps %xmm2, %xmm1
 ; WIN32-NEXT:    movaps %xmm3, %xmm2
@@ -415,7 +413,6 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %
 ; WIN32-NEXT:    movaps %xmm5, %xmm4
 ; WIN32-NEXT:    movaps %xmm6, %xmm5
 ; WIN32-NEXT:    movaps (%esp), %xmm6 # 16-byte Reload
-; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 5005752b06de4..b82a1c034197f 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -14,15 +14,26 @@
 ; vector that this ends up returning.
 ; rdar://8368414
 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
-; CHECK-LABEL: test4:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movaps %xmm0, %xmm2
-; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
-; CHECK-NEXT:    addss %xmm1, %xmm0
-; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; CHECK-NEXT:    subss %xmm1, %xmm2
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movaps %xmm0, %xmm2
+; X86-NEXT:    addss %xmm1, %xmm2
+; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    subss %xmm1, %xmm0
+; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT:    movaps %xmm2, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movaps %xmm0, %xmm2
+; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; X64-NEXT:    addss %xmm1, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X64-NEXT:    subss %xmm1, %xmm2
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    retq
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -44,41 +55,41 @@ entry:
 define <4 x float> @vselect(ptr%p, <4 x i32> %q) {
 ; X86-LABEL: vselect:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_1
 ; X86-NEXT:  # %bb.2: # %entry
-; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    xorps %xmm2, %xmm2
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB1_5
 ; X86-NEXT:  .LBB1_4:
-; X86-NEXT:    movss {{.*#+}} xmm2 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB1_8
 ; X86-NEXT:  .LBB1_7:
-; X86-NEXT:    movss {{.*#+}} xmm3 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT:    movss {{.*#+}} xmm3 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_10
 ; X86-NEXT:    jmp .LBB1_11
 ; X86-NEXT:  .LBB1_1:
-; X86-NEXT:    movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; X86-NEXT:    movss {{.*#+}} xmm2 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_4
 ; X86-NEXT:  .LBB1_5: # %entry
-; X86-NEXT:    xorps %xmm2, %xmm2
+; X86-NEXT:    xorps %xmm1, %xmm1
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    je .LBB1_7
 ; X86-NEXT:  .LBB1_8: # %entry
 ; X86-NEXT:    xorps %xmm3, %xmm3
-; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    jne .LBB1_11
 ; X86-NEXT:  .LBB1_10:
 ; X86-NEXT:    movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-NEXT:  .LBB1_11: # %entry
-; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X86-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: vselect:
@@ -144,35 +155,31 @@ define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
 define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-LABEL: PR30512:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sete %bl
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sete %bl
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sete %bl
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sete %dl
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, (%esp)
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl %ecx, (%esp)
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -181,11 +188,9 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; X86-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm2, (%eax)
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: PR30512:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index a48be037ebebc..f9a1c2dde6f7d 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3062,15 +3062,35 @@ define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
-; AVX512-LABEL: test_mm_mul_epu32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
-; AVX512-NEXT:    vpblendd $10, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc2,0x0a]
-; AVX512-NEXT:    # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512-NEXT:    vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
-; AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
-; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; X86-AVX512-LABEL: test_mm_mul_epu32:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; X86-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
+; X86-AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; X86-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc2,0x0a]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; X86-AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; X86-AVX512-NEXT:    retl # encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_mm_mul_epu32:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; X64-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc2,0x0a]
+; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; X64-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
+; X64-AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; X64-AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_mul_epu32:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; X32-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x02,0xc2,0x0a]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; X32-AVX512-NEXT:    vpblendd $10, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x02,0xca,0x0a]
+; X32-AVX512-NEXT:    # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; X32-AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
   %B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
   %res = mul nuw <2 x i64> %A, %B
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 6c94f82ac40d9..9a95453c81543 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -274,25 +274,25 @@ define void @test_x86_sse2_storeu_dq(ptr %a0, <16 x i8> %a1) {
   ; add operation forces the execution domain.
 ; X86-SSE-LABEL: test_x86_sse2_storeu_dq:
 ; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    pcmpeqd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x76,0xc9]
 ; X86-SSE-NEXT:    psubb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf8,0xc1]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movdqu %xmm0, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x00]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_x86_sse2_storeu_dq:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf8,0xc1]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, (%eax) ## encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse2_storeu_dq:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
 ; X86-AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf8,0xc1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vmovdqu %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x00]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -327,34 +327,34 @@ define void @test_x86_sse2_storeu_pd(ptr %a0, <2 x double> %a1) {
   ; fadd operation forces the execution domain.
 ; X86-SSE-LABEL: test_x86_sse2_storeu_pd:
 ; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    xorpd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x57,0xc9]
 ; X86-SSE-NEXT:    movhpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A]
 ; X86-SSE-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-SSE-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X86-SSE-NEXT:    addpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x58,0xc8]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movupd %xmm1, (%eax) ## encoding: [0x66,0x0f,0x11,0x08]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_x86_sse2_storeu_pd:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX1-NEXT:    vmovhpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
 ; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX1-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X86-AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vmovupd %xmm0, (%eax) ## encoding: [0xc5,0xf9,0x11,0x00]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse2_storeu_pd:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
 ; X86-AVX512-NEXT:    vmovhpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A]
 ; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-AVX512-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; X86-AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vmovupd %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x00]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index 6e77d3e4fd134..f7269f36994a7 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -14,18 +14,18 @@ define void @test1(ptr %r, ptr %A, double %B) nounwind  {
 ; X86-SSE-LABEL: test1:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
 ; X86-SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test1:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
 ; X86-AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -51,18 +51,18 @@ define void @test2(ptr %r, ptr %A, double %B) nounwind  {
 ; X86-SSE-LABEL: test2:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
 ; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test2:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
 ; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -91,20 +91,20 @@ define void @test3(ptr %res, ptr %A, ptr %B) nounwind {
 ; X86-SSE-LABEL: test3:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movaps (%edx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test3:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -138,15 +138,15 @@ define void @test3(ptr %res, ptr %A, ptr %B) nounwind {
 define void @test4(<4 x float> %X, ptr %res) nounwind {
 ; X86-SSE-LABEL: test4:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test4:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -222,16 +222,16 @@ define void @test6(ptr %res, ptr %A) nounwind {
 ; X86-SSE-LABEL: test6:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test6:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -392,37 +392,68 @@ define <2 x double> @test11(double %a, double %b) nounwind {
 }
 
 define void @test12() nounwind {
-; SSE-LABEL: test12:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movapd 0, %xmm0
-; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
-; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps %xmm2, 0
-; SSE-NEXT:    ret{{[l|q]}}
+; X86-SSE-LABEL: test12:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    movaps 0, %xmm1
+; X86-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X86-SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; X86-SSE-NEXT:    addps %xmm0, %xmm1
+; X86-SSE-NEXT:    movaps %xmm1, 0
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: test12:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovaps 0, %xmm1
+; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
+; X86-AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT:    vmovaps %xmm0, 0
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX512-LABEL: test12:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmovaps 0, %xmm1
+; X86-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X86-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
+; X86-AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X86-AVX512-NEXT:    vmovaps %xmm0, 0
+; X86-AVX512-NEXT:    retl
+;
+; X64-SSE-LABEL: test12:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movapd 0, %xmm0
+; X64-SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X64-SSE-NEXT:    xorps %xmm2, %xmm2
+; X64-SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X64-SSE-NEXT:    addps %xmm1, %xmm2
+; X64-SSE-NEXT:    movaps %xmm2, 0
+; X64-SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test12:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps 0, %xmm0
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
-; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovaps %xmm0, 0
-; AVX1-NEXT:    ret{{[l|q]}}
-;
-; AVX512-LABEL: test12:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps 0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vmovaps %xmm0, 0
-; AVX512-NEXT:    ret{{[l|q]}}
+; X64-AVX1-LABEL: test12:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovaps 0, %xmm0
+; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; X64-AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vmovaps %xmm0, 0
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX512-LABEL: test12:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovaps 0, %xmm0
+; X64-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X64-AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X64-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; X64-AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-AVX512-NEXT:    vmovaps %xmm0, 0
+; X64-AVX512-NEXT:    retq
   %tmp1 = load <4 x float>, ptr null          ; <<4 x float>> [#uses=2]
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
@@ -435,22 +466,22 @@ define void @test13(ptr %res, ptr %A, ptr %B, ptr %C) nounwind {
 ; X86-SSE-LABEL: test13:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movaps (%edx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
 ; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test13:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
 ; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -480,24 +511,24 @@ define <4 x float> @test14(ptr %x, ptr %y) nounwind {
 ; X86-SSE-LABEL: test14:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm1
-; X86-SSE-NEXT:    movaps (%eax), %xmm2
-; X86-SSE-NEXT:    movaps %xmm2, %xmm0
-; X86-SSE-NEXT:    addps %xmm1, %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    movaps %xmm0, %xmm2
 ; X86-SSE-NEXT:    subps %xmm1, %xmm2
+; X86-SSE-NEXT:    addps %xmm1, %xmm0
 ; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test14:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
-; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
-; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: test14:
@@ -530,16 +561,16 @@ define <4 x float> @test15(ptr %x, ptr %y) nounwind {
 ; X86-SSE-LABEL: test15:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test15:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
 ; X86-AVX-NEXT:    retl
 ;
@@ -703,7 +734,5 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %m
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-AVX1: {{.*}}
-; X64-AVX512: {{.*}}
-; X86-AVX1: {{.*}}
-; X86-AVX512: {{.*}}
+; AVX1: {{.*}}
+; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
index 1a4df9a175ffa..969b00dd83563 100644
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -10,11 +10,11 @@
 define void @t0(ptr %dest, ptr %old) nounwind {
 ; X86-LABEL: t0:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $1, %edx
-; X86-NEXT:    movd %edx, %xmm0
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -38,10 +38,10 @@ define <8 x i16> @t1(ptr %A, ptr %B) nounwind {
 ; X86-LABEL: t1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; X86-NEXT:    movaps %xmm0, %xmm1
-; X86-NEXT:    andnps (%ecx), %xmm1
+; X86-NEXT:    andnps (%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andps (%eax), %xmm0
 ; X86-NEXT:    orps %xmm1, %xmm0
 ; X86-NEXT:    retl
@@ -175,9 +175,9 @@ define void @t8(ptr %res, ptr %A) nounwind {
 ; X86-LABEL: t8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
 ; X86-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -214,10 +214,10 @@ define void @t9(ptr %r, ptr %A) nounwind {
 ; X86-LABEL: t9:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movaps (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
 ; X86-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-NEXT:    movaps %xmm0, (%ecx)
+; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t9:
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index c6f0ec493a36c..bab031af61201 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -838,15 +838,6 @@ define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    ret{{[l|q]}}
-;
-; AVX512-LABEL: test_mm_mul_epi32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    ret{{[l|q]}}
   %A = shl <2 x i64> %a0, <i64 32, i64 32>
   %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
   %B = shl <2 x i64> %a1, <i64 32, i64 32>
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 3960583c46fef..a5f5c59dac9fa 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -509,41 +509,77 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
 ; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
 ; pointless.
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
-; SSE-LABEL: buildvector:
-; SSE:       ## %bb.0: ## %entry
-; SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
-; SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
-; SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
-; SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
-; SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
-; SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-SSE-LABEL: buildvector:
+; X86-SSE:       ## %bb.0: ## %entry
+; X86-SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
+; X86-SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
+; X86-SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
+; X86-SSE-NEXT:    movshdup %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x16,0xc9]
+; X86-SSE-NEXT:    ## xmm1 = xmm1[1,1,3,3]
+; X86-SSE-NEXT:    addss %xmm2, %xmm1 ## encoding: [0xf3,0x0f,0x58,0xca]
+; X86-SSE-NEXT:    insertps $16, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x10]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX1-LABEL: buildvector:
-; AVX1:       ## %bb.0: ## %entry
-; AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
-; AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
-; AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
-; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
-; AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
-; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX1-LABEL: buildvector:
+; X86-AVX1:       ## %bb.0: ## %entry
+; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x58,0xd1]
+; X86-AVX1-NEXT:    vmovshdup %xmm1, %xmm1 ## encoding: [0xc5,0xfa,0x16,0xc9]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[1,1,3,3]
+; X86-AVX1-NEXT:    vmovshdup %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x16,0xc0]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[1,1,3,3]
+; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
+; X86-AVX1-NEXT:    vinsertps $16, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x21,0xc0,0x10]
+; X86-AVX1-NEXT:    ## xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX512-LABEL: buildvector:
-; AVX512:       ## %bb.0: ## %entry
-; AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
-; AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
-; AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
-; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
-; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
-; AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
-; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512-LABEL: buildvector:
+; X86-AVX512:       ## %bb.0: ## %entry
+; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xd1]
+; X86-AVX512-NEXT:    vmovshdup %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xc9]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[1,1,3,3]
+; X86-AVX512-NEXT:    vmovshdup %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xc0]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[1,1,3,3]
+; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
+; X86-AVX512-NEXT:    vinsertps $16, %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc0,0x10]
+; X86-AVX512-NEXT:    ## xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: buildvector:
+; X64-SSE:       ## %bb.0: ## %entry
+; X64-SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
+; X64-SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
+; X64-SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
+; X64-SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
+; X64-SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
+; X64-SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
+; X64-SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: buildvector:
+; X64-AVX1:       ## %bb.0: ## %entry
+; X64-AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
+; X64-AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
+; X64-AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
+; X64-AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
+; X64-AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
+; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
+; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: buildvector:
+; X64-AVX512:       ## %bb.0: ## %entry
+; X64-AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
+; X64-AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
+; X64-AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
+; X64-AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
+; X64-AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
+; X64-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
+; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -1233,31 +1269,57 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; SSE-LABEL: i32_shuf_X00X:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
-; SSE-NEXT:    pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
-; SSE-NEXT:    ## xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT:    pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
-; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-SSE-LABEL: i32_shuf_X00X:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    pshufd $0, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0x00]
+; X86-SSE-NEXT:    ## xmm1 = xmm0[0,0,0,0]
+; X86-SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
+; X86-SSE-NEXT:    pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3]
+; X86-SSE-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX1-LABEL: i32_shuf_X00X:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX1-NEXT:    vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
-; AVX1-NEXT:    ## xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
-; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX1-LABEL: i32_shuf_X00X:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,0,0,0]
+; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; X86-AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX512-LABEL: i32_shuf_X00X:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
-; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
-; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
-; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512-LABEL: i32_shuf_X00X:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
+; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X86-AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: i32_shuf_X00X:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
+; X64-SSE-NEXT:    pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,0,0,0]
+; X64-SSE-NEXT:    pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
+; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: i32_shuf_X00X:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX1-NEXT:    vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: i32_shuf_X00X:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
+; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
+; X64-AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -1267,35 +1329,65 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
 }
 
 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
-; SSE-LABEL: i32_shuf_X0YC:
-; SSE:       ## %bb.0:
-; SSE-NEXT:    pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
-; SSE-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE-NEXT:    pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
-; SSE-NEXT:    ## xmm0 = xmm1[2,2,2,2]
-; SSE-NEXT:    pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
-; SSE-NEXT:    ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
-; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-SSE-LABEL: i32_shuf_X0YC:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    pshufd $170, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xaa]
+; X86-SSE-NEXT:    ## xmm1 = xmm1[2,2,2,2]
+; X86-SSE-NEXT:    pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX1-LABEL: i32_shuf_X0YC:
-; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
-; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
-; AVX1-NEXT:    ## xmm1 = xmm1[2,2,2,2]
-; AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
-; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX1-LABEL: i32_shuf_X0YC:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[2,2,2,2]
+; X86-AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
-; AVX512-LABEL: i32_shuf_X0YC:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
-; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512-NEXT:    vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
-; AVX512-NEXT:    ## xmm1 = xmm1[2,2,2,2]
-; AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
-; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+; X86-AVX512-LABEL: i32_shuf_X0YC:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[2,2,2,2]
+; X86-AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: i32_shuf_X0YC:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
+; X64-SSE-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero
+; X64-SSE-NEXT:    pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa]
+; X64-SSE-NEXT:    ## xmm0 = xmm1[2,2,2,2]
+; X64-SSE-NEXT:    pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
+; X64-SSE-NEXT:    ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: i32_shuf_X0YC:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-AVX1-NEXT:    vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; X64-AVX1-NEXT:    ## xmm1 = xmm1[2,2,2,2]
+; X64-AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: i32_shuf_X0YC:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-AVX512-NEXT:    vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa]
+; X64-AVX512-NEXT:    ## xmm1 = xmm1[2,2,2,2]
+; X64-AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -1474,30 +1566,30 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocaptu
 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocapture readonly %pb, i64 %index) {
 ; X86-SSE-LABEL: insertps_from_vector_load_offset_2:
 ; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-SSE-NEXT:    shll $4, %eax ## encoding: [0xc1,0xe0,0x04]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-SSE-NEXT:    movaps (%ecx,%eax), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x01]
 ; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
 ; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-AVX1-NEXT:    shll $4, %eax ## encoding: [0xc1,0xe0,0x04]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX1-NEXT:    vmovaps (%ecx,%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x01]
 ; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
-; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-AVX512-NEXT:    shll $4, %eax ## encoding: [0xc1,0xe0,0x04]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX512-NEXT:    vmovaps (%ecx,%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x01]
 ; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
@@ -1641,17 +1733,17 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-SSE-NEXT:    movss (%ecx,%eax,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
-; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
-; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
+; X86-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
+; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
+; X86-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
+; X86-SSE-NEXT:    addps %xmm3, %xmm2 ## encoding: [0x0f,0x58,0xd3]
 ; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
 ; X86-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
+; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
 ; X86-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
-; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
-; X86-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
-; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
-; X86-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
-; X86-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
-; X86-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
+; X86-SSE-NEXT:    addps %xmm2, %xmm0 ## encoding: [0x0f,0x58,0xc2]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:
@@ -1659,17 +1751,17 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX1-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vaddps %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x58,0xd3]
 ; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
 ; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
-; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
-; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
-; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
-; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-AVX1-NEXT:    vaddps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc2]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:
@@ -1677,17 +1769,17 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
-; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
 ; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xd3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
-; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
+; X86-AVX512-NEXT:    vaddps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc2]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
@@ -1869,31 +1961,31 @@ define <4 x float> @pr20087(<4 x float> %a, ptr%ptr) {
 define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, ptr noalias nocapture %RET) #1 {
 ; X86-SSE-LABEL: insertps_pr20411:
 ; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee]
 ; X86-SSE-NEXT:    ## xmm1 = xmm1[2,3,2,3]
 ; X86-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
 ; X86-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_pr20411:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
 ; X86-AVX1-NEXT:    ## xmm1 = xmm1[2,3,2,3]
 ; X86-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_pr20411:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee]
 ; X86-AVX512-NEXT:    ## xmm1 = xmm1[2,3,2,3]
 ; X86-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
index 3baa177f548e3..f933637cac0cc 100644
--- a/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -33,9 +33,9 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
 define i32 @test_x86_sse42_pcmpestri128_load(ptr %a0, ptr %a2) {
 ; X86-SSE-LABEL: test_x86_sse42_pcmpestri128_load:
 ; X86-SSE:       ## %bb.0:
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-SSE-NEXT:    movdqa (%eax), %xmm0 ## encoding: [0x66,0x0f,0x6f,0x00]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-SSE-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
 ; X86-SSE-NEXT:    pcmpestri $7, (%ecx), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0x01,0x07]
@@ -44,9 +44,9 @@ define i32 @test_x86_sse42_pcmpestri128_load(ptr %a0, ptr %a2) {
 ;
 ; X86-AVX1-LABEL: test_x86_sse42_pcmpestri128_load:
 ; X86-AVX1:       ## %bb.0:
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vmovdqa (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x00]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX1-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX1-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
 ; X86-AVX1-NEXT:    vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
@@ -55,9 +55,9 @@ define i32 @test_x86_sse42_pcmpestri128_load(ptr %a0, ptr %a2) {
 ;
 ; X86-AVX512-LABEL: test_x86_sse42_pcmpestri128_load:
 ; X86-AVX512:       ## %bb.0:
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-AVX512-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; X86-AVX512-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
 ; X86-AVX512-NEXT:    vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
@@ -424,27 +424,27 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
 define i32 @test_x86_sse42_pcmpistri128_load(ptr %a0, ptr %a1) {
 ; X86-SSE-LABEL: test_x86_sse42_pcmpistri128_load:
 ; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    movdqa (%eax), %xmm0 ## encoding: [0x66,0x0f,0x6f,0x00]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    movdqa (%ecx), %xmm0 ## encoding: [0x66,0x0f,0x6f,0x01]
 ; X86-SSE-NEXT:    pcmpistri $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0x00,0x07]
 ; X86-SSE-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_x86_sse42_pcmpistri128_load:
 ; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX1-NEXT:    vmovdqa (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x00]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x01]
 ; X86-AVX1-NEXT:    vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
 ; X86-AVX1-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse42_pcmpistri128_load:
 ; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX512-NEXT:    vmovdqa (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x01]
 ; X86-AVX512-NEXT:    vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
 ; X86-AVX512-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index 469b62453f37f..37488c062ee69 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -32,18 +32,18 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movswl %dx, %edi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %si
-; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -77,9 +77,9 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movswl %dx, %esi
@@ -128,9 +128,9 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
-; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movswl %dx, %esi
@@ -179,9 +179,9 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shlb $4, %dl
 ; X86-NEXT:    movb %dl, %ch
 ; X86-NEXT:    shlb %cl, %ch
@@ -288,9 +288,9 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shll $14, %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, %edi
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 15069e558b08d..4b48adaf5471e 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -66,82 +66,84 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %eax, %edi
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sarl %cl, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovel %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $24, %esp
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    shldl %cl, %esi, %ebx
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %eax, %ebx
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    sarl %cl, %esi
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb $32, %ch
 ; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    shldl %cl, %edi, %ebx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    cmovnel %edx, %ebx
+; X86-NEXT:    cmovnel %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    sarl %cl, %ebp
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    cmovel %ebp, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edx, %eax
 ; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %esi, %edi
+; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    testb $32, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmovnel %ebp, %edx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    xorl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    notl %esi
-; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    xorl $2147483647, %ebp # imm = 0x7FFFFFFF
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    notl %ecx
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    sarl $31, %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    notl %ebp
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    cmovel %ebx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    xorl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %ebp, 12(%ecx)
+; X86-NEXT:    cmovel (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    notl %esi
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -205,69 +207,63 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    sarl %cl, %ebp
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %ebp, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovel %edx, %ebx
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sarl %cl, %eax
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %dl
-; X86-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    cmovel %ebp, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sarl %cl, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sarl %cl, %ebp
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %ebp, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovel %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sarl %cl, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmpl %ebp, %esi
-; X86-NEXT:    cmovel %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, 12(%esi)
-; X86-NEXT:    movl %eax, 8(%esi)
-; X86-NEXT:    movl %edx, 4(%esi)
-; X86-NEXT:    movl %ebx, (%esi)
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -358,130 +354,115 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ;
 ; X86-LABEL: vec_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movswl %di, %ebp
-; X86-NEXT:    sarl %cl, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movswl %dx, %esi
+; X86-NEXT:    sarl %cl, %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %bx, %bx
+; X86-NEXT:    testw %ax, %ax
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %bp, %bx
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    cmovel %edi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpw %si, %ax
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movswl %si, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %bx, %bx
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    cmpw %di, %bx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %esi, %ebp
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movswl %dx, %esi
-; X86-NEXT:    sarl %cl, %esi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    testw %di, %di
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT:    cmpw %si, %di
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovel %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movswl %ax, %edx
-; X86-NEXT:    sarl %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %dx, %si
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    sarl %cl, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %dx, %dx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movswl %dx, %eax
-; X86-NEXT:    sarl %cl, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %ax, %si
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovel %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movswl %ax, %esi
-; X86-NEXT:    sarl %cl, %esi
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
+; X86-NEXT:    sarl %cl, %edi
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    testw %dx, %dx
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    cmovel %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movswl %ax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
 ; X86-NEXT:    sarl %cl, %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testw %si, %si
-; X86-NEXT:    sets %dl
-; X86-NEXT:    addl $32767, %edx # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %si
-; X86-NEXT:    cmovel %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testw %dx, %dx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movswl %ax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movswl %si, %edi
 ; X86-NEXT:    sarl %cl, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testw %si, %si
+; X86-NEXT:    testw %dx, %dx
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmpw %di, %si
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movw %cx, 14(%eax)
-; X86-NEXT:    movw %dx, 12(%eax)
-; X86-NEXT:    movw %bx, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movw %cx, 4(%eax)
-; X86-NEXT:    movw %bp, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %ecx
 ; X86-NEXT:    movw %cx, (%eax)
-; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -625,252 +606,217 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ;
 ; X86-LABEL: vec_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb %ch, %bh
-; X86-NEXT:    shlb %cl, %bh
-; X86-NEXT:    movzbl %bh, %esi
-; X86-NEXT:    sarb %cl, %bh
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testb %ch, %ch
-; X86-NEXT:    sets %al
-; X86-NEXT:    addl $127, %eax
-; X86-NEXT:    cmpb %bh, %ch
-; X86-NEXT:    cmovel %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    shlb %cl, %al
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    sarb %cl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shlb %cl, %dl
+; X86-NEXT:    movzbl %dl, %esi
+; X86-NEXT:    sarb %cl, %dl
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %bl, %bl
+; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %al, %bl
+; X86-NEXT:    cmpb %dl, %al
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dh, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shlb %cl, %al
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    sarb %cl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %dh, %dh
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %al, %dh
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb %ah, %al
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shlb %cl, %al
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    sarb %cl, %al
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %ah, %ah
-; X86-NEXT:    sets %dl
-; X86-NEXT:    addl $127, %edx
-; X86-NEXT:    cmpb %al, %ah
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 11(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 10(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 9(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %cl, 8(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movb %cl, 7(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movb %cl, 6(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    sarb %cl, %dl
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    sets %bl
-; X86-NEXT:    addl $127, %ebx
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovel %esi, %ebx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %esi
-; X86-NEXT:    sarb %cl, %ah
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    sets %dl
-; X86-NEXT:    addl $127, %edx
-; X86-NEXT:    cmpb %ah, %al
-; X86-NEXT:    cmovel %esi, %edx
+; X86-NEXT:    movb %cl, 5(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %esi
-; X86-NEXT:    sarb %cl, %ah
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    cmpb %ah, %al
+; X86-NEXT:    cmpb %dh, %dl
 ; X86-NEXT:    cmovel %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    cmpb %dh, %dl
+; X86-NEXT:    cmovel %esi, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    cmpb %dh, %dl
+; X86-NEXT:    cmovel %esi, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    cmpb %dh, %dl
+; X86-NEXT:    cmovel %esi, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %dh
+; X86-NEXT:    shlb %cl, %dh
+; X86-NEXT:    movzbl %dh, %esi
+; X86-NEXT:    sarb %cl, %dh
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    sets %cl
+; X86-NEXT:    addl $127, %ecx
+; X86-NEXT:    cmpb %dh, %dl
+; X86-NEXT:    cmovel %esi, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
diff --git a/llvm/test/CodeGen/X86/ssub_sat.ll b/llvm/test/CodeGen/X86/ssub_sat.ll
index 8ecc8b39ac468..06dbdd43b4954 100644
--- a/llvm/test/CodeGen/X86/ssub_sat.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat.ll
@@ -12,8 +12,8 @@ declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl %edx, %eax
 ; X86-NEXT:    setns %cl
@@ -71,8 +71,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpw %dx, %ax
 ; X86-NEXT:    setns %cl
@@ -102,11 +102,11 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    cmpb %al, %dl
 ; X86-NEXT:    setns %cl
 ; X86-NEXT:    addl $127, %ecx
-; X86-NEXT:    subb %dl, %al
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movzbl %dl, %eax
 ; X86-NEXT:    cmovol %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
@@ -129,14 +129,14 @@ define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind {
 define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    cmpb $7, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    movl $7, %eax
-; X86-NEXT:    cmovll %ecx, %eax
-; X86-NEXT:    cmpb $-7, %al
+; X86-NEXT:    cmpb $7, %cl
+; X86-NEXT:    cmovll %edx, %eax
 ; X86-NEXT:    movl $248, %ecx
+; X86-NEXT:    cmpb $-7, %al
 ; X86-NEXT:    cmovgel %eax, %ecx
 ; X86-NEXT:    movsbl %cl, %eax
 ; X86-NEXT:    retl
@@ -163,43 +163,43 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    setns %al
 ; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    subl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovol %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %esi, %edx
 ; X86-NEXT:    setns %al
 ; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    subl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovol %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    cmpl %edi, %esi
 ; X86-NEXT:    setns %al
 ; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X86-NEXT:    subl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    cmovol %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edi, %ebx
+; X86-NEXT:    setns %al
+; X86-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    cmovol %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    setns %bl
-; X86-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
-; X86-NEXT:    subl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovol %ebx, %edi
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/ssub_sat_plus.ll b/llvm/test/CodeGen/X86/ssub_sat_plus.ll
index 5baf7a1dac74c..1dd34e4f3b712 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_plus.ll
@@ -11,9 +11,9 @@ declare i64 @llvm.ssub.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: func32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpl %edx, %eax
 ; X86-NEXT:    setns %cl
@@ -74,9 +74,9 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define signext i16 @func16(i16 signext %x, i16 signext %y, i16 signext %z) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imulw {{[0-9]+}}(%esp), %dx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    cmpw %dx, %ax
 ; X86-NEXT:    setns %cl
@@ -140,20 +140,20 @@ define signext i8 @func8(i8 signext %x, i8 signext %y, i8 signext %z) nounwind {
 define signext i4 @func4(i4 signext %x, i4 signext %y, i4 signext %z) nounwind {
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mulb {{[0-9]+}}(%esp)
 ; X86-NEXT:    shlb $4, %al
 ; X86-NEXT:    sarb $4, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl $7, %eax
 ; X86-NEXT:    cmpb $7, %cl
-; X86-NEXT:    movl $7, %ecx
-; X86-NEXT:    cmovll %eax, %ecx
-; X86-NEXT:    cmpb $-7, %cl
-; X86-NEXT:    movl $248, %eax
-; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    movsbl %al, %eax
+; X86-NEXT:    cmovll %edx, %eax
+; X86-NEXT:    movl $248, %ecx
+; X86-NEXT:    cmpb $-7, %al
+; X86-NEXT:    cmovgel %eax, %ecx
+; X86-NEXT:    movsbl %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func4:
diff --git a/llvm/test/CodeGen/X86/stack-align-memcpy.ll b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
index 7eb1f7980bf62..e59553629e221 100644
--- a/llvm/test/CodeGen/X86/stack-align-memcpy.ll
+++ b/llvm/test/CodeGen/X86/stack-align-memcpy.ll
@@ -19,55 +19,55 @@ define void @test1(ptr nocapture %x, i32 %y) nounwind {
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $80, %esp
 ; CHECK-NEXT:    movl %esp, %esi
-; CHECK-NEXT:    movl 8(%ebp), %ecx
-; CHECK-NEXT:    movl 12(%ebp), %edx
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    addl $15, %edx
-; CHECK-NEXT:    andl $-16, %edx
-; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    addl $15, %ecx
+; CHECK-NEXT:    andl $-16, %ecx
+; CHECK-NEXT:    subl %ecx, %eax
 ; CHECK-NEXT:    movl %eax, %esp
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    movl 84(%ecx), %edi
-; CHECK-NEXT:    movl 80(%ecx), %ebx
-; CHECK-NEXT:    movl 76(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 68(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 72(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 64(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 68(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 60(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 64(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 56(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 60(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 52(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 56(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 48(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 52(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 44(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 48(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 40(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 44(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 36(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 40(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 32(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 36(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 28(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 32(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 24(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 28(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 20(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 24(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 16(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 20(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 12(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 16(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 8(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 12(%ecx), %edx
-; CHECK-NEXT:    movl %edx, 4(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 8(%ecx), %edx
-; CHECK-NEXT:    movl %edx, (%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl (%ecx), %edx
-; CHECK-NEXT:    movl %edx, 72(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 4(%ecx), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %edx
+; CHECK-NEXT:    movl 84(%edx), %edi
+; CHECK-NEXT:    movl 80(%edx), %ebx
+; CHECK-NEXT:    movl 76(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 68(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 72(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 64(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 68(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 60(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 64(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 56(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 60(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 52(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 56(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 48(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 52(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 44(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 48(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 40(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 44(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 36(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 40(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 32(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 36(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 28(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 32(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 24(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 28(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 20(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 24(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 16(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 20(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 12(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 16(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 8(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 12(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 4(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 8(%edx), %ecx
+; CHECK-NEXT:    movl %ecx, (%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl (%edx), %ecx
+; CHECK-NEXT:    movl %ecx, 72(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 4(%edx), %edx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
@@ -88,7 +88,7 @@ define void @test1(ptr nocapture %x, i32 %y) nounwind {
 ; CHECK-NEXT:    pushl 8(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 4(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl (%esi) ## 4-byte Folded Reload
-; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    pushl 72(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    calll _bar
@@ -116,53 +116,53 @@ define void @test2(ptr nocapture %x, i32 %y, ptr %z) nounwind {
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $80, %esp
 ; CHECK-NEXT:    movl %esp, %esi
-; CHECK-NEXT:    movl 12(%ebp), %edi
-; CHECK-NEXT:    movl 8(%ebp), %eax
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:    movl 84(%eax), %edx
-; CHECK-NEXT:    movl 80(%eax), %ebx
-; CHECK-NEXT:    movl 76(%eax), %ecx
+; CHECK-NEXT:    movl 8(%ebp), %eax
+; CHECK-NEXT:    movl 84(%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, 68(%esi) ## 4-byte Spill
-; CHECK-NEXT:    movl 72(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 64(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl 80(%eax), %edx
+; CHECK-NEXT:    movl 76(%eax), %edi
+; CHECK-NEXT:    movl 72(%eax), %ebx
 ; CHECK-NEXT:    movl 68(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 60(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 64(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 64(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 56(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 60(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 60(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 52(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 56(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 56(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 48(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 52(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 52(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 44(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 48(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 48(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 40(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 44(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 44(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 36(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 40(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 40(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 32(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 36(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 36(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 28(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 32(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 32(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 24(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 28(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 28(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 20(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 24(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 24(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 16(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 20(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 20(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 12(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 16(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 16(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 8(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 12(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 12(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, 4(%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 8(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 8(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, (%esi) ## 4-byte Spill
+; CHECK-NEXT:    movl %ecx, 4(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl (%eax), %ecx
 ; CHECK-NEXT:    movl %ecx, 72(%esi) ## 4-byte Spill
 ; CHECK-NEXT:    movl 4(%eax), %eax
+; CHECK-NEXT:    movl 16(%ebp), %ecx
+; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %ebx
-; CHECK-NEXT:    pushl 68(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 64(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 60(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 56(%esi) ## 4-byte Folded Reload
@@ -179,16 +179,16 @@ define void @test2(ptr nocapture %x, i32 %y, ptr %z) nounwind {
 ; CHECK-NEXT:    pushl 12(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 8(%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl 4(%esi) ## 4-byte Folded Reload
-; CHECK-NEXT:    pushl (%esi) ## 4-byte Folded Reload
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl 72(%esi) ## 4-byte Folded Reload
-; CHECK-NEXT:    pushl 16(%ebp)
+; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    calll _bar
 ; CHECK-NEXT:    addl $96, %esp
 ; CHECK-NEXT:    movl %esp, %eax
-; CHECK-NEXT:    addl $15, %edi
-; CHECK-NEXT:    andl $-16, %edi
-; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    addl $15, %ecx
+; CHECK-NEXT:    andl $-16, %ecx
+; CHECK-NEXT:    subl %ecx, %eax
 ; CHECK-NEXT:    movl %eax, %esp
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    pushl %eax
@@ -215,11 +215,11 @@ define void @test3(ptr nocapture %x, i32 %y, ptr %z) nounwind {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $112, %esp
-; CHECK-NEXT:    movl 16(%ebp), %eax
-; CHECK-NEXT:    movl 8(%ebp), %esi
 ; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl 8(%ebp), %esi
 ; CHECK-NEXT:    movl $22, %ecx
 ; CHECK-NEXT:    rep;movsl (%esi), %es:(%edi)
+; CHECK-NEXT:    movl 16(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    calll _bar
 ; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/stack-align.ll b/llvm/test/CodeGen/X86/stack-align.ll
index 334a81787b84b..61799cea5e534 100644
--- a/llvm/test/CodeGen/X86/stack-align.ll
+++ b/llvm/test/CodeGen/X86/stack-align.ll
@@ -14,14 +14,14 @@ target triple = "i686-apple-darwin8"
 define void @test(ptr byval({ double, double })  %z, ptr %P) nounwind {
 ; CHECK-LABEL: test:
 ; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [NaN,NaN]
+; CHECK-NEXT:    movsd _G, %xmm1 ## xmm1 = mem[0],zero
+; CHECK-NEXT:    andpd %xmm0, %xmm1
 ; CHECK-NEXT:    movl 20(%esp), %eax
-; CHECK-NEXT:    movsd _G, %xmm0 ## xmm0 = mem[0],zero
-; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [NaN,NaN]
-; CHECK-NEXT:    andpd %xmm1, %xmm0
-; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    movlpd %xmm1, (%eax)
 ; CHECK-NEXT:    movsd 4(%esp), %xmm2 ## xmm2 = mem[0],zero
-; CHECK-NEXT:    andpd %xmm1, %xmm2
-; CHECK-NEXT:    addsd %xmm0, %xmm2
+; CHECK-NEXT:    andpd %xmm0, %xmm2
+; CHECK-NEXT:    addsd %xmm1, %xmm2
 ; CHECK-NEXT:    movsd %xmm2, (%eax)
 ; CHECK-NEXT:    retl
 entry:
@@ -56,12 +56,11 @@ define <2 x double> @test3(<2 x double> %x, <2 x double> %y) alignstack(32) noun
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-32, %esp
-; CHECK-NEXT:    subl $64, %esp
-; CHECK-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    subl $32, %esp
+; CHECK-NEXT:    mulpd %xmm1, %xmm0
+; CHECK-NEXT:    movapd %xmm0, (%esp) ## 16-byte Spill
 ; CHECK-NEXT:    calll _test2
-; CHECK-NEXT:    movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; CHECK-NEXT:    mulpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; CHECK-NEXT:    movaps (%esp), %xmm0 ## 16-byte Reload
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
index b7d1b593d1ab6..fa248de7e04e5 100644
--- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll
@@ -39,8 +39,8 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 {
 ; CHECK-X86-32-NEXT:    movl %esp, %ebp
 ; CHECK-X86-32-NEXT:    .cfi_def_cfa_register %ebp
 ; CHECK-X86-32-NEXT:    subl $8, %esp
-; CHECK-X86-32-NEXT:    movl 8(%ebp), %ecx
 ; CHECK-X86-32-NEXT:    movl %esp, %eax
+; CHECK-X86-32-NEXT:    movl 8(%ebp), %ecx
 ; CHECK-X86-32-NEXT:    leal 15(,%ecx,4), %ecx
 ; CHECK-X86-32-NEXT:    andl $-16, %ecx
 ; CHECK-X86-32-NEXT:    subl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
index b5b9ce95a46ba..e82eda75924a1 100644
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -115,14 +115,12 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ;
 ; CHECK-X86-LABEL: push_before_probe:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-X86-NEXT:    pushl %edx
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-X86-NEXT:    pushl %ecx
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-X86-NEXT:    pushl %eax
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-X86-NEXT:    movl %esp, %eax
 ; CHECK-X86-NEXT:    subl $69632, %eax # imm = 0x11000
 ; CHECK-X86-NEXT:    .cfi_def_cfa_register %eax
@@ -133,35 +131,32 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X86-NEXT:    cmpl %eax, %esp
 ; CHECK-X86-NEXT:    jne .LBB1_1
 ; CHECK-X86-NEXT:  # %bb.2:
-; CHECK-X86-NEXT:    subl $2380, %esp # imm = 0x94C
+; CHECK-X86-NEXT:    subl $2368, %esp # imm = 0x940
 ; CHECK-X86-NEXT:    .cfi_def_cfa_register %esp
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 72032
-; CHECK-X86-NEXT:    .cfi_offset %eax, -20
-; CHECK-X86-NEXT:    .cfi_offset %ecx, -16
-; CHECK-X86-NEXT:    .cfi_offset %edx, -12
-; CHECK-X86-NEXT:    .cfi_offset %esi, -8
-; CHECK-X86-NEXT:    movl 72056(%esp), %eax
-; CHECK-X86-NEXT:    movl 72048(%esp), %edx
-; CHECK-X86-NEXT:    movl 72040(%esp), %ecx
-; CHECK-X86-NEXT:    movl 72032(%esp), %esi
-; CHECK-X86-NEXT:    addl 72036(%esp), %esi
-; CHECK-X86-NEXT:    addl 72044(%esp), %ecx
-; CHECK-X86-NEXT:    addl %esi, %ecx
-; CHECK-X86-NEXT:    addl 72052(%esp), %edx
-; CHECK-X86-NEXT:    addl 72060(%esp), %eax
-; CHECK-X86-NEXT:    addl %edx, %eax
-; CHECK-X86-NEXT:    addl %ecx, %eax
-; CHECK-X86-NEXT:    movl %eax, 392(%esp)
-; CHECK-X86-NEXT:    movl %eax, 28792(%esp)
-; CHECK-X86-NEXT:    addl $72012, %esp # imm = 0x1194C
-; CHECK-X86-NEXT:    .cfi_def_cfa_offset 20
-; CHECK-X86-NEXT:    popl %eax
+; CHECK-X86-NEXT:    .cfi_def_cfa_offset 72016
+; CHECK-X86-NEXT:    .cfi_offset %eax, -16
+; CHECK-X86-NEXT:    .cfi_offset %ecx, -12
+; CHECK-X86-NEXT:    .cfi_offset %edx, -8
+; CHECK-X86-NEXT:    movl 72040(%esp), %eax
+; CHECK-X86-NEXT:    addl 72044(%esp), %eax
+; CHECK-X86-NEXT:    movl 72032(%esp), %ecx
+; CHECK-X86-NEXT:    addl 72036(%esp), %ecx
+; CHECK-X86-NEXT:    addl %eax, %ecx
+; CHECK-X86-NEXT:    movl 72024(%esp), %eax
+; CHECK-X86-NEXT:    addl 72028(%esp), %eax
+; CHECK-X86-NEXT:    movl 72016(%esp), %edx
+; CHECK-X86-NEXT:    addl 72020(%esp), %edx
+; CHECK-X86-NEXT:    addl %eax, %edx
+; CHECK-X86-NEXT:    addl %ecx, %edx
+; CHECK-X86-NEXT:    movl %edx, 392(%esp)
+; CHECK-X86-NEXT:    movl %edx, 28792(%esp)
+; CHECK-X86-NEXT:    addl $72000, %esp # imm = 0x11940
 ; CHECK-X86-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-X86-NEXT:    popl %ecx
+; CHECK-X86-NEXT:    popl %eax
 ; CHECK-X86-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-X86-NEXT:    popl %edx
+; CHECK-X86-NEXT:    popl %ecx
 ; CHECK-X86-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-X86-NEXT:    popl %esi
+; CHECK-X86-NEXT:    popl %edx
 ; CHECK-X86-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
index 74fe07e88aa33..e5c355a007ec0 100644
--- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
+++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll
@@ -16,10 +16,10 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 {
 ; I686-NEXT:    subl $24, %esp
 ; I686-NEXT:    movl %esp, -28(%ebp)
 ; I686-NEXT:    movl $-1, -16(%ebp)
-; I686-NEXT:    leal -24(%ebp), %eax
 ; I686-NEXT:    movl $___ehhandler$pr66984, -20(%ebp)
-; I686-NEXT:    movl %fs:0, %ecx
-; I686-NEXT:    movl %ecx, -24(%ebp)
+; I686-NEXT:    movl %fs:0, %eax
+; I686-NEXT:    movl %eax, -24(%ebp)
+; I686-NEXT:    leal -24(%ebp), %eax
 ; I686-NEXT:    movl %eax, %fs:0
 ; I686-NEXT:    movl $1, -16(%ebp)
 ; I686-NEXT:    calll _throw
@@ -48,8 +48,8 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 {
 ; I686-NEXT:    pushl %ebp
 ; I686-NEXT:    addl $12, %ebp
 ; I686-NEXT:    movl %esp, -28(%ebp)
-; I686-NEXT:    movl -36(%ebp), %ecx
 ; I686-NEXT:    movl $2, -16(%ebp)
+; I686-NEXT:    movl -36(%ebp), %ecx
 ; I686-NEXT:    calll _cleanup
 ; I686-NEXT:    movl $LBB0_3, %eax
 ; I686-NEXT:    popl %ebp
@@ -82,10 +82,10 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 {
 ; X86_64-NEXT:    .seh_endprologue
 ; X86_64-NEXT:    movq $-2, -16(%rbp)
 ; X86_64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X86_64-NEXT:  .Ltmp0:
+; X86_64-NEXT:  .Ltmp0: # EH_LABEL
 ; X86_64-NEXT:    callq throw
 ; X86_64-NEXT:    nop
-; X86_64-NEXT:  .Ltmp1:
+; X86_64-NEXT:  .Ltmp1: # EH_LABEL
 ; X86_64-NEXT:  # %bb.1: # %bb14
 ; X86_64-NEXT:  .LBB0_3: # Block address taken
 ; X86_64-NEXT:    # %exit
diff --git a/llvm/test/CodeGen/X86/stack-protector-msvc.ll b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
index 3109733e0b0b7..d731820e230c8 100644
--- a/llvm/test/CodeGen/X86/stack-protector-msvc.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
@@ -149,10 +149,10 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl %ebp
 ; MSVC-X86-NEXT:    movl %esp, %ebp
 ; MSVC-X86-NEXT:    pushl %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl %eax, -4(%ebp)
 ; MSVC-X86-NEXT:    movl 8(%ebp), %eax
-; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    movl %ecx, -4(%ebp)
 ; MSVC-X86-NEXT:    shll $2, %eax
 ; MSVC-X86-NEXT:    calll __chkstk
 ; MSVC-X86-NEXT:    movl %esp, %eax
@@ -288,10 +288,10 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    andl $-32, %esp
 ; MSVC-X86-NEXT:    subl $32, %esp
 ; MSVC-X86-NEXT:    movl %esp, %esi
+; MSVC-X86-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl %eax, 12(%esi)
 ; MSVC-X86-NEXT:    movl 8(%ebp), %eax
-; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    movl %ecx, 12(%esi)
 ; MSVC-X86-NEXT:    shll $2, %eax
 ; MSVC-X86-NEXT:    calll __chkstk
 ; MSVC-X86-NEXT:    movl %esp, %edi
diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll
index 97e31b3fa422b..8aa28888ac57a 100644
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@@ -188,19 +188,19 @@ define i32 @test7(ptr nocapture %a0, i8 zeroext %a1, ptr %P2) nounwind {
 ;
 ; X86-BWON-LABEL: test7:
 ; X86-BWON:       ## %bb.0: ## %entry
-; X86-BWON-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-BWON-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BWON-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BWON-NEXT:    movl (%eax), %eax
+; X86-BWON-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BWON-NEXT:    movb %cl, 5(%edx)
 ; X86-BWON-NEXT:    retl
 ;
 ; X86-BWOFF-LABEL: test7:
 ; X86-BWOFF:       ## %bb.0: ## %entry
-; X86-BWOFF-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BWOFF-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BWOFF-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BWOFF-NEXT:    movl (%eax), %eax
+; X86-BWOFF-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BWOFF-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BWOFF-NEXT:    movb %cl, 5(%edx)
 ; X86-BWOFF-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
index 23ab291e6c11a..990c6ba954b43 100644
--- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
+++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
@@ -179,10 +179,10 @@ entry:
 define void @volatile_zero_64(ptr %p) minsize {
 ; CHECK32-LABEL: volatile_zero_64:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    xorl %ecx, %ecx
-; CHECK32-NEXT:    movl %ecx, 4(%eax)
-; CHECK32-NEXT:    movl %ecx, (%eax)
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl %eax, 4(%ecx)
+; CHECK32-NEXT:    movl %eax, (%ecx)
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: volatile_zero_64:
@@ -230,11 +230,11 @@ entry:
 define void @volatile_minus_one_64(ptr %p) minsize {
 ; CHECK32-LABEL: volatile_minus_one_64:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    xorl %ecx, %ecx
-; CHECK32-NEXT:    decl %ecx
-; CHECK32-NEXT:    movl %ecx, 4(%eax)
-; CHECK32-NEXT:    movl %ecx, (%eax)
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    decl %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl %eax, 4(%ecx)
+; CHECK32-NEXT:    movl %eax, (%ecx)
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: volatile_minus_one_64:
diff --git a/llvm/test/CodeGen/X86/storetrunc-fp.ll b/llvm/test/CodeGen/X86/storetrunc-fp.ll
index d1e7b1800459e..7c00bdd183203 100644
--- a/llvm/test/CodeGen/X86/storetrunc-fp.ll
+++ b/llvm/test/CodeGen/X86/storetrunc-fp.ll
@@ -4,10 +4,10 @@
 define void @foo(x86_fp80 %a, x86_fp80 %b, ptr %fp) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    faddp %st, %st(1)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    fstps (%eax)
 ; CHECK-NEXT:    retl
 	%c = fadd x86_fp80 %a, %b
diff --git a/llvm/test/CodeGen/X86/strict-fsub-combines.ll b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
index 774ea02ccd87a..04c149248fd73 100644
--- a/llvm/test/CodeGen/X86/strict-fsub-combines.ll
+++ b/llvm/test/CodeGen/X86/strict-fsub-combines.ll
@@ -8,10 +8,10 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    subss %xmm1, %xmm0
-; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    subss %xmm0, %xmm1
+; X86-NEXT:    movss %xmm1, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    popl %eax
@@ -36,10 +36,10 @@ define double @fneg_strict_fsub_to_strict_fadd_d(double %x, double %y) nounwind
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    xorpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT:    subsd %xmm1, %xmm0
-; X86-NEXT:    movsd %xmm0, (%esp)
+; X86-NEXT:    subsd %xmm0, %xmm1
+; X86-NEXT:    movsd %xmm1, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    wait
 ; X86-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 39cbee54737c3..3187b48897dd9 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -69,9 +69,9 @@ define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs,
 ; X86-NEXT:    jmp .LBB2_3
 ; X86-NEXT:  .LBB2_2: # %compare
 ; X86-NEXT:    movdqa %xmm0, (%esp)
+; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    andl $15, %ecx
 ; X86-NEXT:    movzbl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subb 16(%esp,%ecx), %al
 ; X86-NEXT:  .LBB2_3: # %exit
 ; X86-NEXT:    movzbl %al, %eax
@@ -119,15 +119,13 @@ exit:
 define i1 @pcmpestri_mem_eq_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
 ; X86-LABEL: pcmpestri_mem_eq_i8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm0
 ; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
 ; X86-NEXT:    setae %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestri_mem_eq_i8:
@@ -150,15 +148,13 @@ entry:
 define i32 @pcmpestri_mem_idx_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
 ; X86-LABEL: pcmpestri_mem_idx_i8:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm0
 ; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestri_mem_idx_i8:
@@ -182,15 +178,14 @@ define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm1
 ; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm1
-; X86-NEXT:    movdqu (%ecx), %xmm0
 ; X86-NEXT:    pcmpestri $24, %xmm0, %xmm1
 ; X86-NEXT:    cmpl $16, %ecx
 ; X86-NEXT:    jne .LBB5_2
@@ -199,14 +194,13 @@ define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X86-NEXT:    jmp .LBB5_3
 ; X86-NEXT:  .LBB5_2: # %compare
 ; X86-NEXT:    movdqa %xmm1, (%esp)
+; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    andl $15, %ecx
 ; X86-NEXT:    movzbl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subb 16(%esp,%ecx), %al
 ; X86-NEXT:  .LBB5_3: # %exit
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -315,10 +309,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
 ; X86-NEXT:    jmp .LBB8_3
 ; X86-NEXT:  .LBB8_2: # %compare
 ; X86-NEXT:    movdqa %xmm0, (%esp)
+; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    andl $14, %ecx
 ; X86-NEXT:    movzwl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subw 16(%esp,%ecx), %ax
 ; X86-NEXT:  .LBB8_3: # %exit
 ; X86-NEXT:    movzwl %ax, %eax
@@ -369,15 +363,13 @@ exit:
 define i1 @pcmpestri_mem_eq_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
 ; X86-LABEL: pcmpestri_mem_eq_i16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm0
 ; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
 ; X86-NEXT:    setae %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestri_mem_eq_i16:
@@ -402,15 +394,13 @@ entry:
 define i32 @pcmpestri_mem_idx_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
 ; X86-LABEL: pcmpestri_mem_idx_i16:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm0
 ; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestri_mem_idx_i16:
@@ -436,15 +426,14 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm1
 ; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl 16(%ebp), %ecx
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movdqu (%esi), %xmm1
-; X86-NEXT:    movdqu (%ecx), %xmm0
 ; X86-NEXT:    pcmpestri $25, %xmm0, %xmm1
 ; X86-NEXT:    cmpl $8, %ecx
 ; X86-NEXT:    jne .LBB11_2
@@ -453,15 +442,14 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
 ; X86-NEXT:    jmp .LBB11_3
 ; X86-NEXT:  .LBB11_2: # %compare
 ; X86-NEXT:    movdqa %xmm1, (%esp)
+; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    andl $14, %ecx
 ; X86-NEXT:    movzwl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subw 16(%esp,%ecx), %ax
 ; X86-NEXT:  .LBB11_3: # %exit
 ; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -560,9 +548,9 @@ define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movdqa %xmm0, (%esp)
+; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    andl $15, %ecx
 ; X86-NEXT:    movzbl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subb 16(%esp,%ecx), %al
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -608,8 +596,8 @@ define i1 @pcmpistri_mem_eq_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-LABEL: pcmpistri_mem_eq_i8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
@@ -632,8 +620,8 @@ define i32 @pcmpistri_mem_idx_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-LABEL: pcmpistri_mem_idx_i8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
@@ -659,9 +647,9 @@ define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm1
 ; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm1
 ; X86-NEXT:    pcmpistri $24, %xmm0, %xmm1
 ; X86-NEXT:    cmpl $16, %ecx
 ; X86-NEXT:    jne .LBB17_2
@@ -670,9 +658,9 @@ define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-NEXT:    jmp .LBB17_3
 ; X86-NEXT:  .LBB17_2: # %compare
 ; X86-NEXT:    movdqa %xmm1, (%esp)
+; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    andl $15, %ecx
 ; X86-NEXT:    movzbl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subb 16(%esp,%ecx), %al
 ; X86-NEXT:  .LBB17_3: # %exit
 ; X86-NEXT:    movzbl %al, %eax
@@ -774,10 +762,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movdqa %xmm0, (%esp)
+; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    andl $14, %ecx
 ; X86-NEXT:    movzwl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subw 16(%esp,%ecx), %ax
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -826,8 +814,8 @@ define i1 @pcmpistri_mem_eq_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-LABEL: pcmpistri_mem_eq_i16:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
@@ -852,8 +840,8 @@ define i32 @pcmpistri_mem_idx_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-LABEL: pcmpistri_mem_idx_i16:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm0
+; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
@@ -881,9 +869,9 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm1
 ; X86-NEXT:    movdqu (%eax), %xmm0
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movdqu (%eax), %xmm1
 ; X86-NEXT:    pcmpistri $25, %xmm0, %xmm1
 ; X86-NEXT:    cmpl $8, %ecx
 ; X86-NEXT:    jne .LBB23_2
@@ -892,10 +880,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
 ; X86-NEXT:    jmp .LBB23_3
 ; X86-NEXT:  .LBB23_2: # %compare
 ; X86-NEXT:    movdqa %xmm1, (%esp)
+; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    andl $14, %ecx
 ; X86-NEXT:    movzwl (%esp,%ecx), %eax
-; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    subw 16(%esp,%ecx), %ax
 ; X86-NEXT:  .LBB23_3: # %exit
 ; X86-NEXT:    movzwl %ax, %eax
@@ -949,19 +937,17 @@ define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i
 ; X86-LABEL: pcmpestr_index_flag:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X86-NEXT:    movl %ecx, (%esi)
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ecx, (%edi)
-; X86-NEXT:    movl %ebx, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
@@ -988,19 +974,17 @@ entry:
 define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %fptr) nounwind {
 ; X86-LABEL: pcmpestr_mask_flag:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
-; X86-NEXT:    setb %bl
 ; X86-NEXT:    movdqa %xmm0, (%esi)
-; X86-NEXT:    movl %ebx, (%ecx)
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestr_mask_flag:
@@ -1025,19 +1009,17 @@ entry:
 define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr) nounwind {
 ; X86-LABEL: pcmpestr_mask_index:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movdqa %xmm0, %xmm2
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
-; X86-NEXT:    movdqa %xmm0, (%edi)
-; X86-NEXT:    movl %ecx, (%esi)
+; X86-NEXT:    movdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestr_mask_index:
@@ -1063,27 +1045,23 @@ entry:
 define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
 ; X86-LABEL: pcmpestr_mask_index_flag:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movdqa %xmm0, %xmm2
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
+; X86-NEXT:    movdqa %xmm0, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movdqa %xmm0, (%ebp)
-; X86-NEXT:    movl %ecx, (%edi)
-; X86-NEXT:    movl %ebx, (%esi)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpestr_mask_index_flag:
@@ -1114,15 +1092,13 @@ entry:
 define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind {
 ; X86-LABEL: pcmpistr_index_flag:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %ecx, (%esi)
-; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpistr_index_flag:
@@ -1145,11 +1121,11 @@ define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %
 ; X86-LABEL: pcmpistr_mask_flag:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
+; X86-NEXT:    movdqa %xmm0, (%ecx)
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movdqa %xmm0, (%edx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
@@ -1172,11 +1148,12 @@ entry:
 define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr) nounwind {
 ; X86-LABEL: pcmpistr_mask_index:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm2
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
 ; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
-; X86-NEXT:    movdqa %xmm0, (%edx)
+; X86-NEXT:    movdqa %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pcmpistri $24, %xmm1, %xmm2
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -1198,21 +1175,17 @@ entry:
 define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
 ; X86-LABEL: pcmpistr_mask_index_flag:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movdqa %xmm0, %xmm2
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movdqa %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    pcmpistri $24, %xmm1, %xmm2
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movdqa %xmm0, (%esi)
 ; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpistr_mask_index_flag:
@@ -1240,23 +1213,19 @@ entry:
 define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, ptr %rhsptr, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
 ; X86-LABEL: pcmpistr_mask_index_flag_load:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqu (%ecx), %xmm2
+; X86-NEXT:    movdqu (%eax), %xmm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpistrm $24, %xmm2, %xmm0
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    movdqa %xmm0, (%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    pcmpistri $24, %xmm2, %xmm1
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movdqa %xmm0, (%esi)
 ; X86-NEXT:    movl %ecx, (%edx)
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pcmpistr_mask_index_flag_load:
@@ -1289,9 +1258,9 @@ define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, ptr %rhsptr, i32
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movntdqa (%eax), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movntdqa (%ecx), %xmm1
 ; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X86-NEXT:    setb %bl
diff --git a/llvm/test/CodeGen/X86/sub-of-bias.ll b/llvm/test/CodeGen/X86/sub-of-bias.ll
index f9454c4e7d23e..3e2d5aa332a85 100644
--- a/llvm/test/CodeGen/X86/sub-of-bias.ll
+++ b/llvm/test/CodeGen/X86/sub-of-bias.ll
@@ -116,12 +116,12 @@ define i32 @t2_commutative(i32 %ptr, i32 %mask) nounwind {
 define i32 @n3_extrause1(i32 %ptr, i32 %mask, ptr %bias_storage) nounwind {
 ; X86-LABEL: n3_extrause1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %eax, %edx
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: n3_extrause1:
@@ -142,9 +142,9 @@ define i32 @n3_extrause1(i32 %ptr, i32 %mask, ptr %bias_storage) nounwind {
 define i32 @n4_different_ptrs(i32 %ptr0, i32 %ptr1, i32 %mask) nounwind {
 ; X86-LABEL: n4_different_ptrs:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -161,9 +161,9 @@ define i32 @n4_different_ptrs(i32 %ptr0, i32 %ptr1, i32 %mask) nounwind {
 define i32 @n5_different_ptrs_commutative(i32 %ptr0, i32 %ptr1, i32 %mask) nounwind {
 ; X86-LABEL: n5_different_ptrs_commutative:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 75333bf835f89..8d61603c11ccb 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -510,16 +510,16 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -544,16 +544,16 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -578,16 +578,16 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -612,16 +612,16 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -646,16 +646,16 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -680,16 +680,16 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
 ; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -718,18 +718,18 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -756,9 +756,9 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
 ; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
 ; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X86-AVX-NEXT:    retl
@@ -766,9 +766,9 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
 ; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
 ; X86-AVX512-NEXT:    retl
 ;
@@ -803,25 +803,25 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
 define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
 ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
 ; X86-AVX1:       # %bb.0: # %entry
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm4 = [3,4]
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vmovdqu %xmm3, ga4+16
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
-; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm5 = [3,4]
-; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm6
+; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, gb4+32
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; X86-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
-; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
-; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
-; X86-AVX1-NEXT:    vmovups %ymm1, gb4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, gb4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
@@ -829,13 +829,13 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
 ; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, ga4
-; X86-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
-; X86-AVX2-NEXT:    vmovdqu %ymm1, gb4
+; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovdqu %ymm0, gb4+32
+; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovdqu %ymm0, gb4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -843,11 +843,11 @@ define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64>
 ; X86-AVX512:       # %bb.0: # %entry
 ; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
 ; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
 ; X86-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
-; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
 ; X86-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
+; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
 ;
@@ -918,13 +918,13 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X86-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
-; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
-; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
-; X86-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
-; X86-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
 ; X86-AVX-NEXT:    vmovupd %ymm0, ga2
-; X86-AVX-NEXT:    vmovupd %ymm2, gb2+32
-; X86-AVX-NEXT:    vmovupd %ymm1, gb2
+; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm0
+; X86-AVX-NEXT:    vdivpd %ymm3, %ymm0, %ymm0
+; X86-AVX-NEXT:    vmovupd %ymm0, gb2+32
+; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm0
+; X86-AVX-NEXT:    vdivpd %ymm3, %ymm0, %ymm0
+; X86-AVX-NEXT:    vmovupd %ymm0, gb2
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
@@ -932,11 +932,11 @@ define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x d
 ; X86-AVX512:       # %bb.0: # %entry
 ; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
-; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
 ; X86-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
-; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
 ; X86-AVX512-NEXT:    vmovupd %zmm1, gb2
+; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
 ;
@@ -988,24 +988,24 @@ define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x
 ; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
 ; X86-AVX1-NEXT:    # ymm3 = mem[0,1,0,1]
 ; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
-; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
-; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; X86-AVX1-NEXT:    vpaddd 8(%ebp), %xmm3, %xmm4
-; X86-AVX1-NEXT:    vpaddd 24(%ebp), %xmm3, %xmm5
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; X86-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X86-AVX1-NEXT:    vandps %ymm3, %ymm4, %ymm3
 ; X86-AVX1-NEXT:    vmovdqu %xmm0, ha4
-; X86-AVX1-NEXT:    vmovups %ymm1, hb4
-; X86-AVX1-NEXT:    vmovups %ymm3, hc4+32
-; X86-AVX1-NEXT:    vmovups %ymm2, hc4
+; X86-AVX1-NEXT:    vpaddd 8(%ebp), %xmm3, %xmm0
+; X86-AVX1-NEXT:    vpaddd 24(%ebp), %xmm3, %xmm4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, hc4+32
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
+; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, hc4
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
+; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, hb4
 ; X86-AVX1-NEXT:    movl %ebp, %esp
 ; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1019,17 +1019,17 @@ define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x
 ; X86-AVX2-NEXT:    subl $32, %esp
 ; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
 ; X86-AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
-; X86-AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpaddd 8(%ebp), %ymm3, %ymm4
+; X86-AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm4
+; X86-AVX2-NEXT:    vmovdqu %ymm4, hc4+32
+; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
-; X86-AVX2-NEXT:    vmovdqu %xmm0, ha4
-; X86-AVX2-NEXT:    vmovdqu %ymm1, hb4
-; X86-AVX2-NEXT:    vmovdqu %ymm3, hc4+32
 ; X86-AVX2-NEXT:    vmovdqu %ymm2, hc4
+; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vmovdqu %ymm1, hb4
+; X86-AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqu %xmm0, ha4
 ; X86-AVX2-NEXT:    movl %ebp, %esp
 ; X86-AVX2-NEXT:    popl %ebp
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1040,13 +1040,13 @@ define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x
 ; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
 ; X86-AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X86-AVX512-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; X86-AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; X86-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
-; X86-AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm2
 ; X86-AVX512-NEXT:    vmovdqu %xmm0, ha4
-; X86-AVX512-NEXT:    vmovdqu %ymm1, hb4
-; X86-AVX512-NEXT:    vmovdqu64 %zmm2, hc4
+; X86-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
+; X86-AVX512-NEXT:    vpandd %zmm3, %zmm0, %zmm0
+; X86-AVX512-NEXT:    vmovdqu64 %zmm0, hc4
+; X86-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm0
+; X86-AVX512-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vmovdqu %ymm0, hb4
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 5699c447baf41..fa0a3381a0502 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -41,9 +41,9 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-i386-NEXT:    subl $8, %esp
 ; CHECK-i386-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-i386-NEXT:    .cfi_offset %esi, -8
-; CHECK-i386-NEXT:    movl 16(%esp), %esi
 ; CHECK-i386-NEXT:    movl $0, 4(%esp)
 ; CHECK-i386-NEXT:    movl $16, (%esp)
+; CHECK-i386-NEXT:    movl 16(%esp), %esi
 ; CHECK-i386-NEXT:    calll _malloc
 ; CHECK-i386-NEXT:    movl %eax, (%esi)
 ; CHECK-i386-NEXT:    movb $1, 8(%eax)
@@ -568,31 +568,27 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror
 ;
 ; CHECK-i386-LABEL: foo_sret:
 ; CHECK-i386:       ## %bb.0: ## %entry
-; CHECK-i386-NEXT:    pushl %ebx
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-i386-NEXT:    pushl %edi
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-i386-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-i386-NEXT:    pushl %esi
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-i386-NEXT:    subl $16, %esp
+; CHECK-i386-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-i386-NEXT:    subl $20, %esp
 ; CHECK-i386-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-i386-NEXT:    .cfi_offset %esi, -16
-; CHECK-i386-NEXT:    .cfi_offset %edi, -12
-; CHECK-i386-NEXT:    .cfi_offset %ebx, -8
+; CHECK-i386-NEXT:    .cfi_offset %esi, -12
+; CHECK-i386-NEXT:    .cfi_offset %edi, -8
 ; CHECK-i386-NEXT:    movl 32(%esp), %esi
-; CHECK-i386-NEXT:    movl 36(%esp), %edi
-; CHECK-i386-NEXT:    movl 40(%esp), %ebx
 ; CHECK-i386-NEXT:    movl $0, 4(%esp)
 ; CHECK-i386-NEXT:    movl $16, (%esp)
+; CHECK-i386-NEXT:    movl 40(%esp), %edi
 ; CHECK-i386-NEXT:    calll _malloc
-; CHECK-i386-NEXT:    movl %eax, (%ebx)
+; CHECK-i386-NEXT:    movl %eax, (%edi)
 ; CHECK-i386-NEXT:    movb $1, 8(%eax)
-; CHECK-i386-NEXT:    movl %edi, 4(%esi)
+; CHECK-i386-NEXT:    movl 36(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 4(%esi)
 ; CHECK-i386-NEXT:    movl %esi, %eax
-; CHECK-i386-NEXT:    addl $16, %esp
+; CHECK-i386-NEXT:    addl $20, %esp
 ; CHECK-i386-NEXT:    popl %esi
 ; CHECK-i386-NEXT:    popl %edi
-; CHECK-i386-NEXT:    popl %ebx
 ; CHECK-i386-NEXT:    retl $4
 
 ; spill sret to stack
@@ -1055,9 +1051,9 @@ define swiftcc float @foo_swiftcc(ptr swifterror %error_ptr_ref) {
 ; CHECK-i386-NEXT:    subl $8, %esp
 ; CHECK-i386-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-i386-NEXT:    .cfi_offset %esi, -8
-; CHECK-i386-NEXT:    movl 16(%esp), %esi
 ; CHECK-i386-NEXT:    movl $0, 4(%esp)
 ; CHECK-i386-NEXT:    movl $16, (%esp)
+; CHECK-i386-NEXT:    movl 16(%esp), %esi
 ; CHECK-i386-NEXT:    calll _malloc
 ; CHECK-i386-NEXT:    movl %eax, (%esi)
 ; CHECK-i386-NEXT:    movb $1, 8(%eax)
@@ -1456,25 +1452,9 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, ptr swiftself,
 ;
 ; CHECK-i386-LABEL: params_in_reg:
 ; CHECK-i386:       ## %bb.0:
-; CHECK-i386-NEXT:    pushl %ebp
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-i386-NEXT:    pushl %ebx
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 12
-; CHECK-i386-NEXT:    pushl %edi
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-i386-NEXT:    pushl %esi
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 20
 ; CHECK-i386-NEXT:    subl $60, %esp
-; CHECK-i386-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-i386-NEXT:    .cfi_offset %esi, -20
-; CHECK-i386-NEXT:    .cfi_offset %edi, -16
-; CHECK-i386-NEXT:    .cfi_offset %ebx, -12
-; CHECK-i386-NEXT:    .cfi_offset %ebp, -8
+; CHECK-i386-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-i386-NEXT:    movl $0, 56(%esp)
-; CHECK-i386-NEXT:    movl 120(%esp), %ebx
-; CHECK-i386-NEXT:    movl 124(%esp), %ebp
-; CHECK-i386-NEXT:    movl 128(%esp), %esi
-; CHECK-i386-NEXT:    movl 132(%esp), %edi
 ; CHECK-i386-NEXT:    leal 56(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, 52(%esp)
 ; CHECK-i386-NEXT:    movl $0, 48(%esp)
@@ -1491,36 +1471,36 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, ptr swiftself,
 ; CHECK-i386-NEXT:    movl $0, 4(%esp)
 ; CHECK-i386-NEXT:    movl $1, (%esp)
 ; CHECK-i386-NEXT:    calll _params_in_reg2
-; CHECK-i386-NEXT:    movl %edi, 52(%esp)
-; CHECK-i386-NEXT:    movl %esi, 48(%esp)
-; CHECK-i386-NEXT:    movl %ebp, 44(%esp)
-; CHECK-i386-NEXT:    movl %ebx, 40(%esp)
 ; CHECK-i386-NEXT:    movl 116(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 36(%esp)
+; CHECK-i386-NEXT:    movl %eax, 52(%esp)
 ; CHECK-i386-NEXT:    movl 112(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 32(%esp)
+; CHECK-i386-NEXT:    movl %eax, 48(%esp)
 ; CHECK-i386-NEXT:    movl 108(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 28(%esp)
+; CHECK-i386-NEXT:    movl %eax, 44(%esp)
 ; CHECK-i386-NEXT:    movl 104(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 24(%esp)
+; CHECK-i386-NEXT:    movl %eax, 40(%esp)
 ; CHECK-i386-NEXT:    movl 100(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 20(%esp)
+; CHECK-i386-NEXT:    movl %eax, 36(%esp)
 ; CHECK-i386-NEXT:    movl 96(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 16(%esp)
+; CHECK-i386-NEXT:    movl %eax, 32(%esp)
 ; CHECK-i386-NEXT:    movl 92(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 12(%esp)
+; CHECK-i386-NEXT:    movl %eax, 28(%esp)
 ; CHECK-i386-NEXT:    movl 88(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 8(%esp)
+; CHECK-i386-NEXT:    movl %eax, 24(%esp)
 ; CHECK-i386-NEXT:    movl 84(%esp), %eax
-; CHECK-i386-NEXT:    movl %eax, 4(%esp)
+; CHECK-i386-NEXT:    movl %eax, 20(%esp)
 ; CHECK-i386-NEXT:    movl 80(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 16(%esp)
+; CHECK-i386-NEXT:    movl 76(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 12(%esp)
+; CHECK-i386-NEXT:    movl 72(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 8(%esp)
+; CHECK-i386-NEXT:    movl 68(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 4(%esp)
+; CHECK-i386-NEXT:    movl 64(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, (%esp)
 ; CHECK-i386-NEXT:    calll _params_in_reg2
 ; CHECK-i386-NEXT:    addl $60, %esp
-; CHECK-i386-NEXT:    popl %esi
-; CHECK-i386-NEXT:    popl %edi
-; CHECK-i386-NEXT:    popl %ebx
-; CHECK-i386-NEXT:    popl %ebp
 ; CHECK-i386-NEXT:    retl
   %error_ptr_ref = alloca swifterror ptr, align 8
   store ptr null, ptr %error_ptr_ref
@@ -1692,11 +1672,7 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 ; CHECK-i386-NEXT:    .cfi_offset %edi, -16
 ; CHECK-i386-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-i386-NEXT:    .cfi_offset %ebp, -8
-; CHECK-i386-NEXT:    movl 148(%esp), %esi
 ; CHECK-i386-NEXT:    movl $0, 64(%esp)
-; CHECK-i386-NEXT:    movl 192(%esp), %ebx
-; CHECK-i386-NEXT:    movl 196(%esp), %ebp
-; CHECK-i386-NEXT:    movl 200(%esp), %edi
 ; CHECK-i386-NEXT:    leal 64(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, 52(%esp)
 ; CHECK-i386-NEXT:    movl $0, 48(%esp)
@@ -1713,9 +1689,12 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 ; CHECK-i386-NEXT:    movl $0, 4(%esp)
 ; CHECK-i386-NEXT:    movl $1, (%esp)
 ; CHECK-i386-NEXT:    calll _params_in_reg2
-; CHECK-i386-NEXT:    movl %edi, 56(%esp)
-; CHECK-i386-NEXT:    movl %ebp, 52(%esp)
-; CHECK-i386-NEXT:    movl %ebx, 48(%esp)
+; CHECK-i386-NEXT:    movl 200(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 56(%esp)
+; CHECK-i386-NEXT:    movl 196(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 52(%esp)
+; CHECK-i386-NEXT:    movl 192(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 48(%esp)
 ; CHECK-i386-NEXT:    movl 188(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, 44(%esp)
 ; CHECK-i386-NEXT:    movl 184(%esp), %eax
@@ -1736,7 +1715,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 ; CHECK-i386-NEXT:    movl %eax, 12(%esp)
 ; CHECK-i386-NEXT:    movl 152(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, 8(%esp)
-; CHECK-i386-NEXT:    movl %esi, 4(%esp)
+; CHECK-i386-NEXT:    movl 148(%esp), %eax
+; CHECK-i386-NEXT:    movl %eax, 4(%esp)
 ; CHECK-i386-NEXT:    leal 88(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, (%esp)
 ; CHECK-i386-NEXT:    calll _params_and_return_in_reg2
@@ -1751,8 +1731,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 ; CHECK-i386-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; CHECK-i386-NEXT:    movl 104(%esp), %ebp
 ; CHECK-i386-NEXT:    movl 108(%esp), %edi
-; CHECK-i386-NEXT:    movl 112(%esp), %esi
-; CHECK-i386-NEXT:    movl 116(%esp), %ebx
+; CHECK-i386-NEXT:    movl 112(%esp), %ebx
+; CHECK-i386-NEXT:    movl 116(%esp), %esi
 ; CHECK-i386-NEXT:    leal 64(%esp), %eax
 ; CHECK-i386-NEXT:    movl %eax, 52(%esp)
 ; CHECK-i386-NEXT:    movl $0, 48(%esp)
@@ -1770,8 +1750,8 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 ; CHECK-i386-NEXT:    movl $1, (%esp)
 ; CHECK-i386-NEXT:    calll _params_in_reg2
 ; CHECK-i386-NEXT:    movl 144(%esp), %eax
-; CHECK-i386-NEXT:    movl %ebx, 28(%eax)
-; CHECK-i386-NEXT:    movl %esi, 24(%eax)
+; CHECK-i386-NEXT:    movl %esi, 28(%eax)
+; CHECK-i386-NEXT:    movl %ebx, 24(%eax)
 ; CHECK-i386-NEXT:    movl %edi, 20(%eax)
 ; CHECK-i386-NEXT:    movl %ebp, 16(%eax)
 ; CHECK-i386-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/tailcall-tailcc.ll b/llvm/test/CodeGen/X86/tailcall-tailcc.ll
index adb032a40d13b..7e6871f91d60a 100644
--- a/llvm/test/CodeGen/X86/tailcall-tailcc.ll
+++ b/llvm/test/CodeGen/X86/tailcall-tailcc.ll
@@ -20,10 +20,10 @@ define dso_local tailcc i32 @tailcaller(i32 %in1, i32 %in2) nounwind {
 ;
 ; UEFI64-LABEL: tailcaller:
 ; UEFI64:       # %bb.0: # %entry
-; UEFI64-NEXT:    subq	$40, %rsp
+; UEFI64-NEXT:    subq $40, %rsp
 ; UEFI64-NEXT:    movl %ecx, %r8d
 ; UEFI64-NEXT:    movl %edx, %r9d
-; UEFI64-NEXT:    addq	$40, %rsp
+; UEFI64-NEXT:    addq $40, %rsp
 ; UEFI64-NEXT:    jmp tailcallee # TAILCALL
 ;
 ; X86-LABEL: tailcaller:
@@ -131,28 +131,24 @@ define dso_local tailcc void @void_test(i32, i32, i32, i32) {
 ;
 ; UEFI64-LABEL: void_test:
 ; UEFI64:       # %bb.0: # %entry
-; UEFI64-NEXT:   subq	$40, %rsp
-; UEFI64-NEXT:   .seh_stackalloc 40
-; UEFI64-NEXT:   .seh_endprologue
-; UEFI64-NEXT:   .seh_startepilogue
-; UEFI64-NEXT:   addq	$40, %rsp
-; UEFI64-NEXT:   .seh_endepilogue
-; UEFI64-NEXT:   jmp void_test # TAILCALL
+; UEFI64-NEXT:    subq $40, %rsp
+; UEFI64-NEXT:    .seh_stackalloc 40
+; UEFI64-NEXT:    .seh_endprologue
+; UEFI64-NEXT:    .seh_startepilogue
+; UEFI64-NEXT:    addq $40, %rsp
+; UEFI64-NEXT:    .seh_endepilogue
+; UEFI64-NEXT:    jmp void_test # TAILCALL
+; UEFI64-NEXT:    .seh_endproc
 ;
 ; X86-LABEL: void_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp void_test # TAILCALL
   entry:
@@ -171,28 +167,24 @@ define dso_local tailcc i1 @i1test(i32, i32, i32, i32) {
 ;
 ; UEFI64-LABEL: i1test:
 ; UEFI64:       # %bb.0: # %entry
-; UEFI64-NEXT:   subq	$40, %rsp
-; UEFI64-NEXT:   .seh_stackalloc 40
-; UEFI64-NEXT:   .seh_endprologue
-; UEFI64-NEXT:   .seh_startepilogue
-; UEFI64-NEXT:   addq	$40, %rsp
-; UEFI64-NEXT:   .seh_endepilogue
-; UEFI64-NEXT:   jmp i1test # TAILCALL
+; UEFI64-NEXT:    subq $40, %rsp
+; UEFI64-NEXT:    .seh_stackalloc 40
+; UEFI64-NEXT:    .seh_endprologue
+; UEFI64-NEXT:    .seh_startepilogue
+; UEFI64-NEXT:    addq $40, %rsp
+; UEFI64-NEXT:    .seh_endepilogue
+; UEFI64-NEXT:    jmp i1test # TAILCALL
+; UEFI64-NEXT:    .seh_endproc
 ;
 ; X86-LABEL: i1test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    jmp i1test # TAILCALL
   entry:
diff --git a/llvm/test/CodeGen/X86/tailcall.ll b/llvm/test/CodeGen/X86/tailcall.ll
index 38df5e555df7d..cec06b96f0e96 100644
--- a/llvm/test/CodeGen/X86/tailcall.ll
+++ b/llvm/test/CodeGen/X86/tailcall.ll
@@ -64,18 +64,13 @@ define dso_local fastcc i32 @noret() nounwind {
 define dso_local fastcc void @void_test(i32, i32, i32, i32) {
 ; CHECK-LABEL: void_test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    subl $8, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    addl $8, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    jmp void_test # TAILCALL
   entry:
@@ -86,18 +81,13 @@ define dso_local fastcc void @void_test(i32, i32, i32, i32) {
 define dso_local fastcc i1 @i1test(i32, i32, i32, i32) {
 ; CHECK-LABEL: i1test:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    subl $8, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    addl $8, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    jmp i1test # TAILCALL
   entry:
diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll
index 1d5da42a8c09b..de391b9311591 100644
--- a/llvm/test/CodeGen/X86/test-shrink-bug.ll
+++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll
@@ -11,10 +11,10 @@ define dso_local i32 @func_35(i64 %p_38) nounwind ssp {
 ; CHECK-X86:       ## %bb.0: ## %entry
 ; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    movsbl _g_14, %eax
+; CHECK-X86-NEXT:    subl $8, %esp
 ; CHECK-X86-NEXT:    xorl %ecx, %ecx
 ; CHECK-X86-NEXT:    testl $255, %eax
 ; CHECK-X86-NEXT:    setg %cl
-; CHECK-X86-NEXT:    subl $8, %esp
 ; CHECK-X86-NEXT:    pushl %ecx
 ; CHECK-X86-NEXT:    pushl %eax
 ; CHECK-X86-NEXT:    calll _func_16
@@ -48,9 +48,9 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) {
 ; CHECK-X86:       ## %bb.0:
 ; CHECK-X86-NEXT:    subl $12, %esp
 ; CHECK-X86-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; CHECK-X86-NEXT:    cmpb $123, {{[0-9]+}}(%esp)
 ; CHECK-X86-NEXT:    sete %al
+; CHECK-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; CHECK-X86-NEXT:    testl $263, %ecx ## imm = 0x107
 ; CHECK-X86-NEXT:    je LBB1_3
 ; CHECK-X86-NEXT:  ## %bb.1:
diff --git a/llvm/test/CodeGen/X86/tls-desc.ll b/llvm/test/CodeGen/X86/tls-desc.ll
index c73986e69e791..3d5386e5a192a 100644
--- a/llvm/test/CodeGen/X86/tls-desc.ll
+++ b/llvm/test/CodeGen/X86/tls-desc.ll
@@ -14,7 +14,6 @@ define ptr @f1() nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll .L0$pb
 ; X86-NEXT:  .L0$pb:
 ; X86-NEXT:    popl %ebx
@@ -22,14 +21,11 @@ define ptr @f1() nounwind {
 ; X86-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %ebx
 ; X86-NEXT:    #APP
 ; X86-NEXT:    #NO_APP
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    #APP
+; X86-NEXT:    #NO_APP
 ; X86-NEXT:    leal x at tlsdesc(%ebx), %eax
 ; X86-NEXT:    calll *x at tlscall(%eax)
 ; X86-NEXT:    addl %gs:0, %eax
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    #APP
-; X86-NEXT:    #NO_APP
-; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index d9c21d3a3f570..a5e7a3b5fd249 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -13,10 +13,10 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub64_16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $48, %eax
+; X86-NEXT:    shrl $3, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $48, %ecx
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-NEXT:    movzwl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub64_16:
@@ -39,10 +39,10 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $112, %eax
+; X86-NEXT:    shrl $3, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $112, %ecx
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    movzwl (%eax,%ecx), %eax
+; X86-NEXT:    movzwl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_16:
@@ -65,10 +65,10 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $96, %eax
+; X86-NEXT:    shrl $3, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $96, %ecx
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    movl (%eax,%ecx), %eax
+; X86-NEXT:    movl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_32:
@@ -91,11 +91,11 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $64, %ecx
+; X86-NEXT:    shrl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $64, %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    movl (%ecx,%edx), %eax
-; X86-NEXT:    movl 4(%ecx,%edx), %edx
+; X86-NEXT:    movl (%edx,%ecx), %eax
+; X86-NEXT:    movl 4(%edx,%ecx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_64:
@@ -118,10 +118,10 @@ define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_8:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $63, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $3, %ecx
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    movzbl (%eax,%ecx), %eax
+; X86-NEXT:    movzbl (%ecx,%eax), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub512_8:
@@ -144,11 +144,11 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $56, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $56, %edx
-; X86-NEXT:    movl (%ecx,%edx), %eax
-; X86-NEXT:    movl 4(%ecx,%edx), %edx
+; X86-NEXT:    movl (%edx,%ecx), %eax
+; X86-NEXT:    movl 4(%edx,%ecx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub512_64:
@@ -170,25 +170,21 @@ define i64 @extractSub512_64(ptr %word, i32 %idx) nounwind {
 define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $48, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $48, %edx
-; X86-NEXT:    movl (%ecx,%edx), %esi
-; X86-NEXT:    movl 4(%ecx,%edx), %edi
-; X86-NEXT:    movl 8(%ecx,%edx), %ebx
-; X86-NEXT:    movl 12(%ecx,%edx), %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl 12(%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl 8(%edx,%ecx), %esi
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl (%edx,%ecx), %esi
+; X86-NEXT:    movl 4(%edx,%ecx), %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: extractSub512_128:
@@ -211,12 +207,12 @@ define i128 @extractSub512_128(ptr %word, i32 %idx) nounwind {
 define i64 @extractSub4096_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub4096_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $4032, %edx # imm = 0xFC0
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    movl (%ecx,%edx), %eax
-; X86-NEXT:    movl 4(%ecx,%edx), %edx
+; X86-NEXT:    movl $4032, %ecx # imm = 0xFC0
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx,%ecx), %eax
+; X86-NEXT:    movl 4(%edx,%ecx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub4096_64:
diff --git a/llvm/test/CodeGen/X86/trunc-to-bool.ll b/llvm/test/CodeGen/X86/trunc-to-bool.ll
index 5a5d057597465..c5eb41a35277b 100644
--- a/llvm/test/CodeGen/X86/trunc-to-bool.ll
+++ b/llvm/test/CodeGen/X86/trunc-to-bool.ll
@@ -19,7 +19,7 @@ define i1 @test2(i32 %val, i32 %mask) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    btl %ecx, %eax
+; CHECK-NEXT:    btl %eax, %ecx
 ; CHECK-NEXT:    jae .LBB1_2
 ; CHECK-NEXT:  # %bb.1: # %ret_true
 ; CHECK-NEXT:    movb $1, %al
diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll
index 0a3c2ae344fd3..82f35056fdea8 100644
--- a/llvm/test/CodeGen/X86/uadd_sat.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat.ll
@@ -12,10 +12,10 @@ declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func:
@@ -31,11 +31,11 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
@@ -100,10 +100,10 @@ define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl $15, %edx
 ; X86-NEXT:    cmpb $15, %al
-; X86-NEXT:    movl $15, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movzbl %dl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
@@ -122,30 +122,24 @@ define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    cmovbl %ebx, %ecx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl %ebx, %esi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebx, %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: vec:
diff --git a/llvm/test/CodeGen/X86/uadd_sat_plus.ll b/llvm/test/CodeGen/X86/uadd_sat_plus.ll
index 654e3d77f52aa..b5fa1242dfb8d 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_plus.ll
@@ -11,11 +11,11 @@ declare i64 @llvm.uadd.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: func32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func32:
@@ -33,11 +33,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-LABEL: func64:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
@@ -113,10 +113,10 @@ define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind {
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl $15, %edx
 ; X86-NEXT:    cmpb $15, %al
-; X86-NEXT:    movl $15, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movzbl %dl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func4:
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index e4a21fcebcbe2..e9e4cc8df3fa6 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -314,11 +314,11 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: ucmp_wide_result:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %cl
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
@@ -467,37 +467,31 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_normal_vectors:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    seta %dl
-; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movsbl %bl, %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movsbl %bl, %esi
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %1
@@ -608,33 +602,27 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_narrow_vec_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 3(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 1(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    seta %ch
-; X86-NEXT:    sbbb $0, %ch
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    seta %dl
-; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movb %dl, 3(%eax)
-; X86-NEXT:    movb %bl, 2(%eax)
-; X86-NEXT:    movb %ch, 1(%eax)
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
   %1 = call <4 x i8> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i8> %1
@@ -688,37 +676,31 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_narrow_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movsbl %al, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    seta %dl
-; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movsbl %dl, %edx
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movsbl %bl, %esi
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    seta %ch
-; X86-NEXT:    sbbb $0, %ch
-; X86-NEXT:    movsbl %ch, %edi
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movsbl %cl, %ecx
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %1 = call <4 x i32> @llvm.ucmp(<4 x i8> %x, <4 x i8> %y)
   ret <4 x i32> %1
@@ -829,85 +811,6 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_wide_vec_result:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    seta %cl
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bh
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %bl
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %bh
-; X86-NEXT:    sbbb $0, %bh
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %ebp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
@@ -915,37 +818,96 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movsbl %al, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edx, 56(%eax)
-; X86-NEXT:    movl %esi, 52(%eax)
-; X86-NEXT:    movl %ebp, 48(%eax)
-; X86-NEXT:    movl %edi, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 52(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 44(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bh, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movsbl (%esp), %edx # 1-byte Folded Reload
-; X86-NEXT:    movl %edx, 32(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bl, %edi
-; X86-NEXT:    movl %edi, 28(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movl %ebx, 24(%eax)
-; X86-NEXT:    movl %edi, 20(%eax)
-; X86-NEXT:    movl %edx, 16(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movsbl %cl, %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <16 x i32> @llvm.ucmp(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i32> %1
@@ -1357,117 +1319,87 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 ;
 ; X86-LABEL: ucmp_wide_vec_op:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 11(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movb %cl, 10(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    seta %bh
-; X86-NEXT:    sbbb $0, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %dh
-; X86-NEXT:    sbbb $0, %dh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %ch
-; X86-NEXT:    sbbb $0, %ch
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    seta %dl
-; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    seta %cl
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %ch, 13(%eax)
-; X86-NEXT:    movb %dh, 12(%eax)
-; X86-NEXT:    movb %bl, 11(%eax)
-; X86-NEXT:    movb %bh, 10(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    seta %cl
+; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %1 = call <16 x i8> @llvm.ucmp(<16 x i32> %x, <16 x i32> %y)
   ret <16 x i8> %1
@@ -2876,513 +2808,443 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl $127, %ebp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $127, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andl $127, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    andl $127, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl %edi, %ebp
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebx, %eax
-; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    cmpl %ebp, %edi
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    setb %al
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    leal (%esi,%ecx,4), %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl %cl, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    shll $6, %esi
+; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movzbl %cl, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $8, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $127, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    andl $3, %eax
+; X86-NEXT:    shll $10, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $127, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    setb %dl
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $12, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    shll $14, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $16, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    andl $127, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    setb %al
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %ebx
+; X86-NEXT:    andl $3, %ebx
+; X86-NEXT:    shll $18, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $127, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %bh
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    shll $20, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $127, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %cl, 4(%edi)
-; X86-NEXT:    movzbl %bh, %ebp
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ebp
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    leal (%ecx,%ebp,4), %ecx
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $4, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $6, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $8, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    andl $3, %edx
-; X86-NEXT:    shll $10, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $12, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $14, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $18, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $22, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $20, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $22, %ecx
-; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $24, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shll $24, %eax
+; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %edi
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $26, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl $127, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %ebx
 ; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $26, %ebx
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $28, %ecx
+; X86-NEXT:    shll $28, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $127, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $127, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movb %dl, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    shll $30, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    shll $30, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 82dfeeee13293..1b180e9ec93ad 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -22,9 +22,9 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ;
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $7, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -90,8 +90,8 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -151,15 +151,14 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -176,8 +175,7 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %tmp = call i64 @llvm.udiv.fix.i64(i64 %x, i64 %y, i32 31)
@@ -225,9 +223,9 @@ define i16 @func7(i16 %x, i16 %y) nounwind {
 ;
 ; X86-LABEL: func7:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $16, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -280,66 +278,63 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shrl %ecx
-; X86-NEXT:    shll $31, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    shll $31, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    shll $31, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    shrl %eax
-; X86-NEXT:    shll $31, %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    shll $31, %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %eax, 12(%ebx)
+; X86-NEXT:    movl %edi, 8(%ebx)
+; X86-NEXT:    movl %esi, 4(%ebx)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, 12(%esi)
-; X86-NEXT:    movl %ebp, 8(%esi)
-; X86-NEXT:    movl %ebx, 4(%esi)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl %eax, (%ebx)
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.udiv.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 31)
   ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index d1e9145c4ccee..78b891f796886 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -26,14 +26,14 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ;
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
-; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -70,8 +70,8 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-NEXT:    shll $14, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
-; X86-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
 ; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
 ; X86-NEXT:    cmovbl %eax, %ecx
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    movswl %cx, %eax
@@ -109,8 +109,8 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    shll $4, %ecx
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -118,11 +118,11 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86-NEXT:    divw %cx
 ; X86-NEXT:    # kill: def $ax killed $ax def $eax
 ; X86-NEXT:    movzwl %ax, %ecx
+; X86-NEXT:    movl $32767, %edx # imm = 0x7FFF
 ; X86-NEXT:    cmpl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    movl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    movswl %cx, %eax
+; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    movswl %dx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
@@ -157,11 +157,11 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ; X86-NEXT:    shlb $2, %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    divb %cl
-; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    movl $15, %ecx
 ; X86-NEXT:    cmpb $15, %al
-; X86-NEXT:    movl $15, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
   %tmp = call i4 @llvm.udiv.fix.sat.i4(i4 %x, i4 %y, i32 2)
   ret i4 %tmp
@@ -192,15 +192,14 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl 16(%ebp), %edx
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
@@ -215,17 +214,16 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivti3
 ; X86-NEXT:    subl $4, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:  .LBB4_2:
-; X86-NEXT:    leal -4(%ebp), %esp
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %tmp = call i64 @llvm.udiv.fix.sat.i64(i64 %x, i64 %y, i32 31)
@@ -256,8 +254,8 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    shll $7, %eax
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    divl %ecx
-; X86-NEXT:    cmpl $262143, %eax # imm = 0x3FFFF
 ; X86-NEXT:    movl $262143, %ecx # imm = 0x3FFFF
+; X86-NEXT:    cmpl $262143, %eax # imm = 0x3FFFF
 ; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
   %x2 = sext i16 %x to i18
@@ -296,8 +294,8 @@ define i16 @func7(i16 %x, i16 %y) nounwind {
 ; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    movl $131071, %ecx # imm = 0x1FFFF
+; X86-NEXT:    cmpl $131071, %eax # imm = 0x1FFFF
 ; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    cmovnel %ecx, %eax
@@ -383,69 +381,70 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    calll __udivdi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $1, %edi
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    movl $1, %ebp
-; X86-NEXT:    cmovael %ebp, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovael %eax, %esi
-; X86-NEXT:    shrdl $1, %edx, %esi
-; X86-NEXT:    cmpl $2, %edi
-; X86-NEXT:    cmovael %ebp, %edi
+; X86-NEXT:    cmovael %edi, %edx
 ; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovael %ecx, %eax
+; X86-NEXT:    shrdl $1, %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 12(%edx)
+; X86-NEXT:    cmpl $2, %ebp
+; X86-NEXT:    cmovael %edi, %ebp
 ; X86-NEXT:    cmovael %ecx, %ebx
-; X86-NEXT:    shrdl $1, %edi, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $1, %ebp, %ebx
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmpl $2, %eax
-; X86-NEXT:    cmovael %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    cmovael %ecx, %edi
-; X86-NEXT:    shrdl $1, %eax, %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl $0
+; X86-NEXT:    cmovael %edi, %eax
+; X86-NEXT:    cmovael %ecx, %esi
+; X86-NEXT:    movl $-1, %ebx
+; X86-NEXT:    shrdl $1, %eax, %esi
+; X86-NEXT:    movl %esi, 4(%edx)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, (%esp)
 ; X86-NEXT:    calll __udivdi3
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    cmpl $2, %edx
-; X86-NEXT:    cmovbl %edx, %ebp
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovael %ecx, %eax
-; X86-NEXT:    shrdl $1, %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    cmovbl %edx, %edi
+; X86-NEXT:    cmovael %ebx, %eax
+; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/uint64-to-float.ll b/llvm/test/CodeGen/X86/uint64-to-float.ll
index 03a8171589622..3a287bbba002d 100644
--- a/llvm/test/CodeGen/X86/uint64-to-float.ll
+++ b/llvm/test/CodeGen/X86/uint64-to-float.ll
@@ -15,9 +15,9 @@ define float @test(i64 %a) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -51,9 +51,9 @@ define float @test(i64 %a) nounwind {
 ; X86-WIN-NEXT:    movl %esp, %ebp
 ; X86-WIN-NEXT:    andl $-8, %esp
 ; X86-WIN-NEXT:    subl $24, %esp
-; X86-WIN-NEXT:    movl 12(%ebp), %eax
 ; X86-WIN-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-WIN-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; X86-WIN-NEXT:    movl 12(%ebp), %eax
 ; X86-WIN-NEXT:    shrl $31, %eax
 ; X86-WIN-NEXT:    fildll {{[0-9]+}}(%esp)
 ; X86-WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/uint_to_fp.ll b/llvm/test/CodeGen/X86/uint_to_fp.ll
index 8c8cbb151974d..19ce2ae103f11 100644
--- a/llvm/test/CodeGen/X86/uint_to_fp.ll
+++ b/llvm/test/CodeGen/X86/uint_to_fp.ll
@@ -7,9 +7,9 @@ define void @test(i32 %x, ptr %y) nounwind {
 ; X86-LABEL: test:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $23, %ecx
-; X86-NEXT:    cvtsi2ss %ecx, %xmm0
+; X86-NEXT:    shrl $23, %eax
+; X86-NEXT:    cvtsi2ss %eax, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index 9943956e50efc..e2ceb0f62f9e4 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -207,8 +207,8 @@ define i64 @test_i64_1(i64 %a) nounwind {
 ; X86-NEXT:    cmpl $1, %ecx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    movl $1, %eax
+; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    popl %esi
@@ -238,27 +238,27 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 40(%ebp), %esi
 ; X86-NEXT:    movl 44(%ebp), %edi
-; X86-NEXT:    cmpl 24(%ebp), %ebx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    cmpl 24(%ebp), %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl 28(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sbbl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sbbl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    cmovbl 24(%ebp), %ebx
-; X86-NEXT:    cmovbl 28(%ebp), %edi
-; X86-NEXT:    cmovbl 32(%ebp), %edx
-; X86-NEXT:    cmovbl %esi, %ecx
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovbl %ebx, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    cmovbl 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    cmovbl 28(%ebp), %edi
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    cmovbl 24(%ebp), %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -291,32 +291,31 @@ define i128 @test_i128_1(i128 %a) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    cmpl $1, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    cmpl $0, 28(%ebp)
 ; X86-NEXT:    movl $1, %esi
 ; X86-NEXT:    cmovnel %eax, %esi
-; X86-NEXT:    cmovel %ecx, %esi
+; X86-NEXT:    cmovel %edi, %esi
 ; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    movl $1, %ebx
 ; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    cmovbl 28(%ebp), %edi
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    cmovel %esi, %ebx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    cmovel 28(%ebp), %edi
 ; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    cmovel %esi, %ebx
 ; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
@@ -369,37 +368,35 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; X86-LABEL: test_v2i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    cmovbl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovbl %ebp, %esi
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    cmpl %ecx, %esi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    cmovbl %edx, %edi
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    cmovbl %ecx, %esi
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %r
@@ -424,36 +421,32 @@ define <2 x i64> @test_v2i64_1(<2 x i64> %a) nounwind {
 ;
 ; X86-LABEL: test_v2i64_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl $1, %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl $1, %ecx
 ; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    movl $1, %ebp
-; X86-NEXT:    cmovel %ebp, %ecx
-; X86-NEXT:    cmovel %ebx, %ecx
-; X86-NEXT:    cmpl $1, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    cmovnel %edi, %ebp
-; X86-NEXT:    cmovel %ebx, %ebp
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
+; X86-NEXT:    cmovel %ecx, %edx
+; X86-NEXT:    cmovel %edi, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    cmpl $1, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    cmovel %edi, %ecx
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %a, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %r
@@ -498,16 +491,14 @@ define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmoval %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    cmoval %esi, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    cmoval %ecx, %edx
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %r = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %r
@@ -533,24 +524,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v3i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmoval %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    cmoval %ebx, %eax
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    cmoval %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmoval %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    cmoval %esi, %ecx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %r = call <3 x i32> @llvm.umax.v3i32(<3 x i32> %a, <3 x i32> %b)
   ret <3 x i32> %r
@@ -576,31 +563,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmoval %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmoval %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmoval %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmoval %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -627,27 +610,23 @@ define <4 x i32> @test_v4i32_1(<4 x i32> %a) nounwind {
 ;
 ; X86-LABEL: test_v4i32_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl $1, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    cmpl $1, %edx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    cmpl $1, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    cmpl $1, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %r
@@ -690,62 +669,47 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    cmoval %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    cmoval %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmoval %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmoval %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmoval %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmoval %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 16(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %r
@@ -779,54 +743,39 @@ define <8 x i32> @test_v8i32_1(<8 x i32> %a) nounwind {
 ;
 ; X86-LABEL: test_v8i32_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $1, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl $1, %edx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    cmpl $1, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    cmpl $1, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    cmpl $1, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    cmpl $1, %ebp
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    cmpl $1, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $1, %eax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 24(%ecx)
-; X86-NEXT:    movl %ebp, 20(%ecx)
-; X86-NEXT:    movl %ebx, 16(%ecx)
-; X86-NEXT:    movl %edi, 12(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $1, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>)
   ret <8 x i32> %r
@@ -846,62 +795,47 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bp, %ax
-; X86-NEXT:    cmoval %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bx, %ax
-; X86-NEXT:    cmoval %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmoval %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    cmoval %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmoval %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw %cx, %ax
 ; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmoval %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %dx, 12(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 10(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 8(%ecx)
-; X86-NEXT:    movw %si, 6(%ecx)
-; X86-NEXT:    movw %di, 4(%ecx)
-; X86-NEXT:    movw %bx, 2(%ecx)
-; X86-NEXT:    movw %bp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %r
@@ -922,54 +856,39 @@ define <8 x i16> @test_v8i16_1(<8 x i16> %a) nounwind {
 ;
 ; X86-LABEL: test_v8i16_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpw $1, %dx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    cmpw $1, %bp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    cmpw $1, %bx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    cmpw $1, %di
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    cmpw $1, %si
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    cmpw $1, %cx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpw $1, %ax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw $1, %ax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 10(%ecx)
-; X86-NEXT:    movw %si, 8(%ecx)
-; X86-NEXT:    movw %di, 6(%ecx)
-; X86-NEXT:    movw %bx, 4(%ecx)
-; X86-NEXT:    movw %bp, 2(%ecx)
-; X86-NEXT:    movw %dx, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpw $1, %cx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %r
@@ -988,123 +907,87 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X86-LABEL: test_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmoval %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmoval %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb %cl, %al
 ; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmoval %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmoval %eax, %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmoval %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmoval %edx, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %r
@@ -1123,107 +1006,71 @@ define <16 x i8> @test_v16i8_1(<16 x i8> %a) nounwind {
 ;
 ; X86-LABEL: test_v16i8_1:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpb $1, %bl
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpb $1, %dl
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpb $1, %cl
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb $1, %al
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpb $1, %bl
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb $1, %dl
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movb %cl, 15(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movb %cl, 14(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpb $1, %cl
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpb $1, %cl
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <16 x i8> %r
@@ -1243,11 +1090,11 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $15, %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmoval %ecx, %eax
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $15, %eax
+; X86-NEXT:    cmpw %cx, %ax
+; X86-NEXT:    cmovbel %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %ax = ashr i16 %a, 15
@@ -1268,9 +1115,9 @@ define i32 @test_signbits_i32(i32 %a, i32 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $17, %eax
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmoval %ecx, %eax
 ; X86-NEXT:    retl
@@ -1323,19 +1170,19 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    shrdl $28, %ecx, %edx
 ; X86-NEXT:    sarl $28, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    cmpl %esi, %edx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    cmovbl %esi, %edx
 ; X86-NEXT:    cmovbl %eax, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    cmovbl %esi, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 79e6eb32605d9..5e2d71da457ce 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -154,26 +154,27 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 44(%ebp), %edx
-; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    cmpl %ecx, 24(%ebp)
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl 48(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl 52(%ebp), %ebx
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    cmovbl %edi, %eax
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %eax, 12(%edx)
+; X86-NEXT:    cmovbl 32(%ebp), %ebx
+; X86-NEXT:    movl %ebx, 8(%edx)
+; X86-NEXT:    cmovbl 28(%ebp), %esi
+; X86-NEXT:    movl %esi, 4(%edx)
 ; X86-NEXT:    cmovbl 24(%ebp), %ecx
-; X86-NEXT:    cmovbl 28(%ebp), %edx
-; X86-NEXT:    cmovbl 32(%ebp), %esi
-; X86-NEXT:    cmovbl %edi, %ebx
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -223,16 +224,14 @@ define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    cmovbl %esi, %eax
 ; X86-NEXT:    cmpl %edx, %ecx
 ; X86-NEXT:    cmovbl %ecx, %edx
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %r = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %r
@@ -258,24 +257,20 @@ define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v3i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    cmovbl %ebx, %eax
-; X86-NEXT:    cmpl %edx, %edi
-; X86-NEXT:    cmovbl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    cmovbl %esi, %ecx
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
   %r = call <3 x i32> @llvm.umin.v3i32(<3 x i32> %a, <3 x i32> %b)
   ret <3 x i32> %r
@@ -301,31 +296,27 @@ define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovbl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovbl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovbl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovbl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -368,62 +359,47 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edi, %eax
-; X86-NEXT:    cmovbl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    cmovbl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %edx, %eax
-; X86-NEXT:    cmovbl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 28(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 16(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %ecx, %edx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %r
@@ -444,62 +420,47 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; X86-LABEL: test_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bp, %ax
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %bx, %ax
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %di, %ax
-; X86-NEXT:    cmovbl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    cmovbl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpw %cx, %ax
 ; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovbl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %cx, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 12(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %dx, 12(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 10(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 8(%ecx)
-; X86-NEXT:    movw %si, 6(%ecx)
-; X86-NEXT:    movw %di, 4(%ecx)
-; X86-NEXT:    movw %bx, 2(%ecx)
-; X86-NEXT:    movw %bp, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 10(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 6(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, 2(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpw %cx, %dx
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movw %cx, (%eax)
 ; X86-NEXT:    retl $4
   %r = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %r
@@ -518,123 +479,87 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X86-LABEL: test_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovbl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb %cl, %al
 ; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movb %cl, 15(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movb %cl, 14(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    cmovbl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    movb %cl, 13(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpb %cl, %al
-; X86-NEXT:    cmovbl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %cl, 15(%eax)
-; X86-NEXT:    movb %dl, 14(%eax)
-; X86-NEXT:    movb %bl, 13(%eax)
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpb %cl, %dl
+; X86-NEXT:    cmovbl %edx, %ecx
 ; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $40, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
   ret <16 x i8> %r
@@ -654,11 +579,11 @@ define i16 @test_signbits_i16(i16 %a, i16 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shrl $15, %ecx
-; X86-NEXT:    cmpw %ax, %cx
-; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrl $15, %eax
+; X86-NEXT:    cmpw %cx, %ax
+; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
   %ax = ashr i16 %a, 15
@@ -679,9 +604,9 @@ define i32 @test_signbits_i32(i32 %a, i32 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $17, %eax
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
@@ -734,19 +659,19 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    shrdl $28, %ecx, %edx
 ; X86-NEXT:    sarl $28, %ecx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    cmovbl %esi, %edx
 ; X86-NEXT:    cmovbl %eax, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    cmovbl %esi, %edx
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, 12(%eax)
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c..18960f8f893fe 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -80,133 +80,135 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $76, %esp
-; X86-NEXT:    movl $4095, %ecx # imm = 0xFFF
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %ebx, (%edx)
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -215,7 +217,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
@@ -235,7 +237,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -254,199 +256,200 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, 28(%eax)
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %ecx, %ebp
@@ -457,54 +460,40 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl $4095, %ebp # imm = 0xFFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    imull %edx, %edi
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edx, %esi
+; X86-NEXT:    imull %edx, %ebp
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edx, %esi
-; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 16(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 24(%ecx)
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 28(%ecx)
 ; X86-NEXT:    movl %eax, 32(%ecx)
-; X86-NEXT:    andl $4095, %ebx # imm = 0xFFF
-; X86-NEXT:    movw %bx, 36(%ecx)
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    andl $4095, %edi # imm = 0xFFF
+; X86-NEXT:    movw %di, 36(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index 5a68484596a2f..6452eb0038ddb 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -116,38 +116,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shldl $30, %eax, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    shldl $30, %eax, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shldl $30, %eax, %ebp
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shldl $30, %eax, %edx
+; X86-NEXT:    movl %edx, 4(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $30, %eax, %edx
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
   ret <4 x i32> %tmp
@@ -230,23 +216,19 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
   ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 8c7078c726328..9bbdb097baf07 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -26,8 +26,8 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shrdl $2, %edx, %eax
-; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmpl $4, %edx
 ; X86-NEXT:    cmovael %ecx, %eax
 ; X86-NEXT:    retl
   %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 2)
@@ -76,11 +76,12 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    shrdl $2, %eax, %ecx
 ; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    movl $-1, %esi
 ; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    cmovel %eax, %edx
+; X86-NEXT:    cmovnel %esi, %ecx
+; X86-NEXT:    cmovel %eax, %esi
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -115,19 +116,19 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl %al, %edx
 ; X86-NEXT:    shlb $4, %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    imull %edx, %eax
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shlb $6, %cl
-; X86-NEXT:    shrb $2, %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    cmpb $4, %ah
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    shlb $6, %al
+; X86-NEXT:    shrb $2, %cl
+; X86-NEXT:    orb %al, %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    movl $255, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    cmpb $4, %ch
+; X86-NEXT:    cmovbl %edx, %eax
 ; X86-NEXT:    shrb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
@@ -189,46 +190,35 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    shrdl $2, %edx, %esi
+; X86-NEXT:    shrdl $2, %edx, %eax
+; X86-NEXT:    movl $-1, %esi
 ; X86-NEXT:    cmpl $4, %edx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovael %ecx, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovael %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shrdl $2, %edx, %ebx
+; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $4, %edx
-; X86-NEXT:    cmovael %ecx, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    cmovael %esi, %eax
+; X86-NEXT:    movl %eax, 8(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    shrdl $2, %edx, %ebp
+; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $4, %edx
-; X86-NEXT:    cmovael %ecx, %ebp
+; X86-NEXT:    cmovael %esi, %eax
+; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    shrdl $2, %edx, %eax
 ; X86-NEXT:    cmpl $4, %edx
-; X86-NEXT:    cmovael %ecx, %eax
-; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl %ebp, 8(%edi)
-; X86-NEXT:    movl %ebx, 4(%edi)
-; X86-NEXT:    movl %esi, (%edi)
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    cmovael %esi, %eax
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
   ret <4 x i32> %tmp
@@ -246,9 +236,9 @@ define i32 @func4(i32 %x, i32 %y) nounwind {
 ;
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovol %ecx, %eax
 ; X86-NEXT:    retl
   %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 0)
@@ -278,28 +268,28 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %cl
-; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    setne %bl
+; X86-NEXT:    andb %dl, %bl
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bl
+; X86-NEXT:    seto %cl
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
-; X86-NEXT:    orb %cl, %ch
-; X86-NEXT:    leal (%edi,%eax), %esi
+; X86-NEXT:    seto %bh
+; X86-NEXT:    orb %cl, %bh
+; X86-NEXT:    orb %bl, %bh
+; X86-NEXT:    leal (%edi,%eax), %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    orb %bh, %cl
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -333,10 +323,10 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ;
 ; X86-LABEL: func6:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shlb $4, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    mulb %cl
 ; X86-NEXT:    movzbl %al, %ecx
 ; X86-NEXT:    movl $255, %eax
@@ -386,38 +376,27 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl $-1, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    cmovol %edi, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    cmovol %edi, %ebx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    movl %eax, 8(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    cmovol %edi, %ebp
+; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    cmovol %edi, %eax
-; X86-NEXT:    movl %eax, 12(%ecx)
-; X86-NEXT:    movl %ebp, 8(%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    cmovol %esi, %eax
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
   ret <4 x i32> %tmp
@@ -519,10 +498,10 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    shrdl $31, %edx, %eax
+; X86-NEXT:    movl $-1, %esi
 ; X86-NEXT:    testl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovnel %esi, %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 89afd1b00444b..b44706fc3f0a4 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -44,59 +44,59 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    leal (%ecx,%eax), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    leal (%ecx,%eax), %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    leal (%esi,%eax), %esi
+; X86-NEXT:    leal (%ecx,%eax), %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
@@ -134,9 +134,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%ecx)
 ; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    setne %al
diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 132683cdb0f9e..1aeffd9cf5122 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -16,28 +16,28 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %cl
-; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    setne %bl
+; X86-NEXT:    andb %dl, %bl
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bl
+; X86-NEXT:    seto %cl
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
-; X86-NEXT:    orb %cl, %ch
-; X86-NEXT:    leal (%edi,%eax), %esi
+; X86-NEXT:    seto %bh
+; X86-NEXT:    orb %cl, %bh
+; X86-NEXT:    orb %bl, %bh
+; X86-NEXT:    leal (%edi,%eax), %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    orb %bh, %cl
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
index 8b7d87da1d6e9..b80e741f3d16e 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
@@ -47,9 +47,9 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
 define i32 @test_optsize(i32 %X) optsize nounwind readnone {
 ; X86-LABEL: test_optsize:
 ; X86:       # %bb.0:
-; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD
-; X86-NEXT:    cmpl $858993460, %eax # imm = 0x33333334
 ; X86-NEXT:    movl $42, %eax
+; X86-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD
+; X86-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
 ; X86-NEXT:    jb .LBB1_2
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    movl $-10, %eax
diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index 9985ac61a4e54..f9bcca75a0991 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -27,18 +27,20 @@ define i16 @func(i16 %x, i16 %y) nounwind {
 ;
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl %dx, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpw %si, %ax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movzwl %si, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    cmpw %di, %dx
+; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %tmp = call i16 @llvm.ushl.sat.i16(i16 %x, i16 %y)
   ret i16 %tmp
@@ -66,17 +68,17 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movzwl %dx, %esi
 ; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    cmovel %edx, %eax
-; X86-NEXT:    cwtl
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movswl %cx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -111,18 +113,18 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
-; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movzwl %dx, %esi
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X86-NEXT:    cmpw %si, %ax
-; X86-NEXT:    movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT:    cmovel %edx, %eax
-; X86-NEXT:    cwtl
+; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    movswl %cx, %eax
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    popl %esi
@@ -156,16 +158,16 @@ define i4 @func4(i4 %x, i4 %y) nounwind {
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    andb $15, %cl
-; X86-NEXT:    shlb $4, %al
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shlb $4, %dl
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %esi
+; X86-NEXT:    shrb %cl, %ch
 ; X86-NEXT:    movl $255, %eax
+; X86-NEXT:    cmpb %ch, %dl
 ; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    shrb $4, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -248,19 +250,21 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
 ;
 ; X86-LABEL: func6:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    movswl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $14, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll $14, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    shrl $14, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %x2 = sext i16 %x to i18
   %y2 = sext i16 %y to i18
@@ -284,17 +288,19 @@ define i32 @func7(i32 %x, i32 %y) nounwind {
 ;
 ; X86-LABEL: func7:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovel %edx, %eax
+; X86-NEXT:    cmpl %edi, %edx
+; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
   %tmp = call i32 @llvm.ushl.sat.i32(i32 %x, i32 %y)
   ret i32 %tmp
@@ -320,13 +326,13 @@ define i8 @func8(i8 %x, i8 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %esi
+; X86-NEXT:    shrb %cl, %ch
 ; X86-NEXT:    movl $255, %eax
+; X86-NEXT:    cmpb %ch, %dl
 ; X86-NEXT:    cmovel %esi, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index bd4743dca545d..c06911fb6fe14 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -66,46 +66,47 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    shll %cl, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    shldl %cl, %eax, %edx
 ; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    cmovnel %ebx, %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    cmovnel %edi, %edx
+; X86-NEXT:    cmovnel %ebx, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovel %edi, %ebx
+; X86-NEXT:    cmovel %esi, %ebx
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrdl %cl, %ebp, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shrdl %cl, %edx, %ebp
 ; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    cmovnel %edi, %ebp
+; X86-NEXT:    cmovnel %esi, %ebp
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    cmovnel %ecx, %edi
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    cmovnel %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    orl %ebp, %ebx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %eax, 12(%ecx)
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    cmovnel %eax, %edx
+; X86-NEXT:    movl %edx, 4(%ecx)
+; X86-NEXT:    cmovnel %eax, %edi
 ; X86-NEXT:    movl %edi, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl $16, %esp
@@ -164,56 +165,50 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;
 ; X86-LABEL: vec_v4i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    cmpl %ebp, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    cmpl %edi, %eax
 ; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    cmpl %ebp, %edi
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovnel %eax, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovnel %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpl %ebx, %esi
+; X86-NEXT:    cmovnel %edx, %edi
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpl %ebx, %esi
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpl %ebx, %esi
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <4 x i32> @llvm.ushl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -297,102 +292,86 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ;
 ; X86-LABEL: vec_v8i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movzwl %ax, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpw %si, %dx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movzwl %si, %edi
+; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    movl $65535, %edx # imm = 0xFFFF
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movzwl %bp, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpw %ax, %di
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmpw %di, %ax
+; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movw %si, 14(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %ebx
-; X86-NEXT:    movzwl %bx, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $65535, %esi # imm = 0xFFFF
-; X86-NEXT:    cmovnel %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl %di, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 10(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movzwl %bp, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 8(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 6(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movzwl %di, %edx
-; X86-NEXT:    shrl %cl, %edx
-; X86-NEXT:    cmpw %dx, %ax
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl $65535, %ebx # imm = 0xFFFF
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movzwl %dx, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpw %ax, %si
-; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, 2(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movzwl %ax, %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    cmpw %si, %bx
-; X86-NEXT:    movl $65535, %ecx # imm = 0xFFFF
-; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movw %ax, 14(%ecx)
-; X86-NEXT:    movw %dx, 12(%ecx)
-; X86-NEXT:    movw %di, 10(%ecx)
-; X86-NEXT:    movw %bp, 8(%ecx)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 6(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, 2(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movw %ax, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movzwl %di, %ebx
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    cmpw %bx, %si
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    movw %di, (%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <8 x i16> @llvm.ushl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -481,191 +460,156 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ;
 ; X86-LABEL: vec_v16i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movb %bl, %bh
-; X86-NEXT:    shlb %cl, %bh
-; X86-NEXT:    movzbl %bh, %edi
-; X86-NEXT:    shrb %cl, %bh
-; X86-NEXT:    cmpb %bh, %bl
-; X86-NEXT:    movl $255, %esi
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dh, %bl
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shlb %cl, %bl
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    shrb %cl, %bl
-; X86-NEXT:    cmpb %bl, %dh
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %ah
-; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    movb %al, %ah
 ; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %edi
+; X86-NEXT:    movzbl %ah, %ebx
 ; X86-NEXT:    shrb %cl, %ah
+; X86-NEXT:    movl $255, %esi
+; X86-NEXT:    cmpb %ah, %al
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb %bl, 15(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb %ah, %ch
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %dl, %ah
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %edi
-; X86-NEXT:    shrb %cl, %ah
-; X86-NEXT:    cmpb %ah, %dl
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 14(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 13(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 12(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 11(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 10(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 9(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 8(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 7(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 6(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %ebp
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 5(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %edi
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
-; X86-NEXT:    cmovnel %esi, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 4(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    shlb %cl, %dl
-; X86-NEXT:    movzbl %dl, %ebx
-; X86-NEXT:    shrb %cl, %dl
-; X86-NEXT:    cmpb %dl, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
 ; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 3(%eax)
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %al, %ah
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %edx
-; X86-NEXT:    shrb %cl, %ah
-; X86-NEXT:    cmpb %ah, %al
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %dl, 15(%eax)
-; X86-NEXT:    movb %bl, 14(%eax)
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movb %cl, 13(%eax)
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movb %cl, 12(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 11(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 10(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 9(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 7(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 6(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 5(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 3(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 2(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, 1(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movb %cl, (%eax)
-; X86-NEXT:    addl $48, %esp
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 2(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, 1(%eax)
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movb %dl, %ch
+; X86-NEXT:    shlb %cl, %ch
+; X86-NEXT:    movzbl %ch, %ebx
+; X86-NEXT:    shrb %cl, %ch
+; X86-NEXT:    cmpb %ch, %dl
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movb %bl, (%eax)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %tmp = call <16 x i8> @llvm.ushl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
diff --git a/llvm/test/CodeGen/X86/usub_sat.ll b/llvm/test/CodeGen/X86/usub_sat.ll
index 6749a1f9147af..5c04cde42260b 100644
--- a/llvm/test/CodeGen/X86/usub_sat.ll
+++ b/llvm/test/CodeGen/X86/usub_sat.ll
@@ -12,8 +12,8 @@ declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
 define i32 @func(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: func:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    retl
@@ -31,9 +31,9 @@ define i32 @func(i32 %x, i32 %y) nounwind {
 define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-LABEL: func2:
 ; X86:       # %bb.0:
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovbl %ecx, %edx
@@ -53,8 +53,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -74,8 +74,8 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y) nounwind {
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 ; X86-LABEL: func8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    cmovbl %ecx, %eax
@@ -97,12 +97,12 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y) nounwind {
 define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 ; X86-LABEL: func3:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    cmovbl %ecx, %eax
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    cmovbl %eax, %ecx
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: func3:
@@ -120,30 +120,24 @@ define zeroext i4 @func3(i4 zeroext %x, i4 zeroext %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-LABEL: vec:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovbl %ebx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl %ebx, %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebx, %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    cmovbl %ecx, %edx
 ; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmovbl %ecx, %edx
+; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: vec:
diff --git a/llvm/test/CodeGen/X86/usub_sat_plus.ll b/llvm/test/CodeGen/X86/usub_sat_plus.ll
index 0fb14ad5cf7b0..a0ef94f0748aa 100644
--- a/llvm/test/CodeGen/X86/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_plus.ll
@@ -11,10 +11,10 @@ declare i64 @llvm.usub.sat.i64(i64, i64)
 define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; X86-LABEL: func32:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    cmovbl %edx, %eax
 ; X86-NEXT:    retl
@@ -34,9 +34,9 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-LABEL: func64:
 ; X86:       # %bb.0:
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    cmovbl %ecx, %edx
@@ -57,10 +57,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y, i16 zeroext %z) nounwind {
 ; X86-LABEL: func16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imulw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    subw %cx, %ax
 ; X86-NEXT:    cmovbl %edx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -82,13 +82,13 @@ define zeroext i16 @func16(i16 zeroext %x, i16 zeroext %y, i16 zeroext %z) nounw
 define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y, i8 zeroext %z) nounwind {
 ; X86-LABEL: func8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mulb {{[0-9]+}}(%esp)
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    cmovbl %edx, %eax
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -111,14 +111,14 @@ define zeroext i8 @func8(i8 zeroext %x, i8 zeroext %y, i8 zeroext %z) nounwind {
 define zeroext i4 @func4(i4 zeroext %x, i4 zeroext %y, i4 zeroext %z) nounwind {
 ; X86-LABEL: func4:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mulb {{[0-9]+}}(%esp)
 ; X86-NEXT:    andb $15, %al
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    cmovbl %edx, %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subb %al, %dl
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    cmovbl %ecx, %eax
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/v2f32.ll b/llvm/test/CodeGen/X86/v2f32.ll
index f1f8b92208faf..67f4b6db786bb 100644
--- a/llvm/test/CodeGen/X86/v2f32.ll
+++ b/llvm/test/CodeGen/X86/v2f32.ll
@@ -13,9 +13,9 @@ define void @test1(<2 x float> %Q, ptr%P2) nounwind {
 ;
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    addss %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss %xmm1, (%eax)
 ; X86-NEXT:    retl
   %a = extractelement <2 x float> %Q, i32 0
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index 01f10372eaa2d..1be4c987aecc6 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -10,11 +10,11 @@ define void @and_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-LABEL: and_masks:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovups (%edx), %ymm0
-; X86-NEXT:    vmovups (%ecx), %ymm1
+; X86-NEXT:    vmovups (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovups (%eax), %ymm1
 ; X86-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovups (%eax), %ymm2
 ; X86-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
 ; X86-NEXT:    vandps %ymm1, %ymm0, %ymm0
@@ -39,11 +39,11 @@ define void @and_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-AVX2-LABEL: and_masks:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX2-NEXT:    vmovups (%ecx), %ymm1
+; X86-AVX2-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vmovups (%eax), %ymm1
 ; X86-AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovups (%eax), %ymm2
 ; X86-AVX2-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
 ; X86-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
@@ -68,10 +68,10 @@ define void @and_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-AVX512-LABEL: and_masks:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX512-NEXT:    vcmpgtps (%ecx), %ymm0, %k1
+; X86-AVX512-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vcmpgtps (%eax), %ymm0, %k1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vcmpgtps (%eax), %ymm0, %k1 {%k1}
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -106,8 +106,8 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-LABEL: neg_masks:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovups (%ecx), %ymm0
+; X86-NEXT:    vmovups (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vcmpnltps (%eax), %ymm0, %ymm0
 ; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
@@ -126,8 +126,8 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-AVX2-LABEL: neg_masks:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    vmovups (%ecx), %ymm0
+; X86-AVX2-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vcmpnltps (%eax), %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
 ; X86-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
@@ -148,8 +148,8 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
 ; X86-AVX512-LABEL: neg_masks:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vmovups (%ecx), %ymm0
+; X86-AVX512-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vcmpnltps (%eax), %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa %ymm0, (%eax)
@@ -252,11 +252,11 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
 ;
 ; X86-AVX2-LABEL: two_ands:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: two_ands:
@@ -294,13 +294,13 @@ entry:
 define <8 x i32> @three_ands(<8 x float> %x) {
 ; X86-LABEL: three_ands:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: three_ands:
@@ -316,11 +316,11 @@ define <8 x i32> @three_ands(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: three_ands:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
@@ -370,13 +370,13 @@ entry:
 define <8 x i32> @four_ands(<8 x float> %x) {
 ; X86-LABEL: four_ands:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X86-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vandps %ymm1, %ymm3, %ymm1
+; X86-NEXT:    vandps %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
@@ -396,11 +396,11 @@ define <8 x i32> @four_ands(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: four_ands:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
@@ -460,13 +460,13 @@ entry:
 define <8 x i32> @five_ands(<8 x float> %x) {
 ; X86-LABEL: five_ands:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X86-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vandps %ymm1, %ymm3, %ymm1
+; X86-NEXT:    vandps %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
@@ -490,11 +490,11 @@ define <8 x i32> @five_ands(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: five_ands:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
@@ -580,11 +580,11 @@ define <8 x i32> @two_or(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: two_or:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: two_or:
@@ -598,9 +598,9 @@ define <8 x i32> @two_or(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: two_or:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k1
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -624,13 +624,13 @@ entry:
 define <8 x i32> @three_or(<8 x float> %x) {
 ; X86-LABEL: three_or:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm0, %ymm2, %ymm0
-; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: three_or:
@@ -646,11 +646,11 @@ define <8 x i32> @three_or(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: three_or:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -670,12 +670,12 @@ define <8 x i32> @three_or(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: three_or:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k1
+; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k0
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    korw %k1, %k2, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -704,13 +704,13 @@ entry:
 define <8 x i32> @four_or(<8 x float> %x) {
 ; X86-LABEL: four_or:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X86-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; X86-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
@@ -730,11 +730,11 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: four_or:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
@@ -760,12 +760,12 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: four_or:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k0
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    korw %k1, %k2, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
 ; X86-AVX512-NEXT:    korw %k1, %k0, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -800,13 +800,13 @@ entry:
 define <8 x i32> @five_or(<8 x float> %x) {
 ; X86-LABEL: five_or:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X86-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vorps %ymm1, %ymm3, %ymm1
+; X86-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm0, %ymm2, %ymm0
@@ -830,11 +830,11 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: five_or:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
@@ -866,12 +866,12 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: five_or:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k0
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    korw %k1, %k2, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k0
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
 ; X86-AVX512-NEXT:    korw %k1, %k0, %k0
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
@@ -914,10 +914,10 @@ define <8 x i32> @three_or_and(<8 x float> %x) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
@@ -934,14 +934,14 @@ define <8 x i32> @three_or_and(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: three_or_and:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: three_or_and:
@@ -958,11 +958,11 @@ define <8 x i32> @three_or_and(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: three_or_and:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    korw %k0, %k1, %k1
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    korw %k1, %k0, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -990,15 +990,15 @@ entry:
 define <8 x i32> @four_or_and(<8 x float> %x) {
 ; X86-LABEL: four_or_and:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X86-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: four_or_and:
@@ -1016,17 +1016,17 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: four_or_and:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm3, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: four_or_and:
@@ -1046,12 +1046,12 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: four_or_and:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    korw %k1, %k0, %k1
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
+; X86-AVX512-NEXT:    korw %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -1084,14 +1084,14 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vandps %ymm2, %ymm3, %ymm2
 ; X86-NEXT:    vorps %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
 ;
@@ -1112,20 +1112,20 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: five_or_and:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm3, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vandps %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm3, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: five_or_and:
@@ -1148,14 +1148,14 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: five_or_and:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
 ; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    korw %k0, %k1, %k0
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    korw %k1, %k0, %k1
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    korw %k2, %k1, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -1190,15 +1190,15 @@ entry:
 define <8 x i32> @four_or_and_xor(<8 x float> %x) {
 ; X86-LABEL: four_or_and_xor:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT:    vxorps %ymm0, %ymm2, %ymm0
+; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: four_or_and_xor:
@@ -1216,17 +1216,17 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: four_or_and_xor:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vxorps %ymm2, %ymm1, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
 ; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm3, %ymm0
+; X86-AVX2-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: four_or_and_xor:
@@ -1246,13 +1246,13 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: four_or_and_xor:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    korw %k1, %k0, %k1
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    kxorw %k1, %k2, %k1
+; X86-AVX512-NEXT:    korw %k0, %k1, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -1284,17 +1284,17 @@ entry:
 define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ; X86-LABEL: five_or_and_xor:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT:    vandps %ymm0, %ymm3, %ymm0
-; X86-NEXT:    vxorps %ymm0, %ymm2, %ymm0
-; X86-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
+; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vxorps %ymm2, %ymm3, %ymm2
+; X86-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; X86-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
+; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: five_or_and_xor:
@@ -1314,20 +1314,20 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ;
 ; X86-AVX2-LABEL: five_or_and_xor:
 ; X86-AVX2:       ## %bb.0: ## %entry
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X86-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-AVX2-NEXT:    vxorps %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vandps %ymm0, %ymm3, %ymm0
-; X86-AVX2-NEXT:    vxorps %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm1, %ymm0, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm3, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorps %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: five_or_and_xor:
@@ -1350,15 +1350,15 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: five_or_and_xor:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k2
-; X86-AVX512-NEXT:    kxorw %k2, %k1, %k1
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
-; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 {%k2}
-; X86-AVX512-NEXT:    kxorw %k2, %k1, %k1
-; X86-AVX512-NEXT:    korw %k0, %k1, %k1
+; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
+; X86-AVX512-NEXT:    kxorw %k1, %k2, %k1
+; X86-AVX512-NEXT:    kxorw %k0, %k1, %k0
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
+; X86-AVX512-NEXT:    korw %k1, %k0, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; X86-AVX512-NEXT:    retl
@@ -1395,14 +1395,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; X86-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4
-; X86-NEXT:    vandps %ymm4, %ymm3, %ymm3
-; X86-NEXT:    vandps %ymm2, %ymm3, %ymm2
+; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; X86-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X86-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; X86-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X86-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT:    vandps %ymm3, %ymm2, %ymm2
 ; X86-NEXT:    vxorps %ymm2, %ymm1, %ymm1
 ; X86-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X86-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -1429,17 +1429,17 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X86-AVX2:       ## %bb.0: ## %entry
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; X86-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; X86-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X86-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm4
-; X86-AVX2-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; X86-AVX2-NEXT:    vandps %ymm2, %ymm3, %ymm2
-; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X86-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-AVX2-NEXT:    vxorps %ymm1, %ymm3, %ymm1
+; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT:    vcmpltps %ymm3, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
 ; X86-AVX2-NEXT:    vxorps %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
 ; X86-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
@@ -1469,14 +1469,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ;
 ; X86-AVX512-LABEL: six_or_and_xor:
 ; X86-AVX512:       ## %bb.0: ## %entry
-; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vcmpneqps %ymm1, %ymm0, %k1
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
-; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1 {%k1}
+; X86-AVX512-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k0 {%k1}
+; X86-AVX512-NEXT:    vcmpgeps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
-; X86-AVX512-NEXT:    kxorw %k0, %k2, %k0
-; X86-AVX512-NEXT:    kxorw %k1, %k0, %k0
+; X86-AVX512-NEXT:    kxorw %k1, %k2, %k1
+; X86-AVX512-NEXT:    kxorw %k0, %k1, %k0
 ; X86-AVX512-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
 ; X86-AVX512-NEXT:    korw %k1, %k0, %k1
 ; X86-AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vaargs-win32.ll b/llvm/test/CodeGen/X86/vaargs-win32.ll
index 38c7b73ebab49..c9ac21591837e 100644
--- a/llvm/test/CodeGen/X86/vaargs-win32.ll
+++ b/llvm/test/CodeGen/X86/vaargs-win32.ll
@@ -38,8 +38,8 @@ define <4 x i32> @foo(<4 x float> inreg %0, ...) nounwind {
 ; MSVC-LABEL: foo:
 ; MSVC:       # %bb.0:
 ; MSVC-NEXT:    pushl %eax
-; MSVC-NEXT:    movups 8(%esp), %xmm0
 ; MSVC-NEXT:    movups 24(%esp), %xmm1
+; MSVC-NEXT:    movups 8(%esp), %xmm0
 ; MSVC-NEXT:    cmpltps %xmm1, %xmm0
 ; MSVC-NEXT:    popl %eax
 ; MSVC-NEXT:    retl
@@ -50,8 +50,8 @@ define <4 x i32> @foo(<4 x float> inreg %0, ...) nounwind {
 ; MINGW-NEXT:    movl %esp, %ebp
 ; MINGW-NEXT:    andl $-16, %esp
 ; MINGW-NEXT:    subl $16, %esp
-; MINGW-NEXT:    movaps 8(%ebp), %xmm0
 ; MINGW-NEXT:    movups 24(%ebp), %xmm1
+; MINGW-NEXT:    movaps 8(%ebp), %xmm0
 ; MINGW-NEXT:    cmpltps %xmm1, %xmm0
 ; MINGW-NEXT:    movl %ebp, %esp
 ; MINGW-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/variable-sized-darwin-bzero.ll b/llvm/test/CodeGen/X86/variable-sized-darwin-bzero.ll
index 9529a42355647..5a48d5f632da9 100644
--- a/llvm/test/CodeGen/X86/variable-sized-darwin-bzero.ll
+++ b/llvm/test/CodeGen/X86/variable-sized-darwin-bzero.ll
@@ -7,8 +7,8 @@ define void @foo(ptr %p, i64 %n) {
 ; CHECK-NEXT:    subl $12, %esp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    calll ___bzero
 ; CHECK-NEXT:    addl $12, %esp
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
index bf37551712bc3..59c797734f777 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
@@ -113,11 +113,11 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm3, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
@@ -125,18 +125,18 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
-; SSE-32-NEXT:    cmoval %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmoval %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -267,11 +267,11 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm3, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
@@ -279,18 +279,18 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
-; SSE-32-NEXT:    cmovael %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovael %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -416,16 +416,16 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm3, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
@@ -433,18 +433,18 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
-; SSE-32-NEXT:    cmoval %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmoval %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -568,16 +568,16 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm3, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
@@ -585,18 +585,18 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
-; SSE-32-NEXT:    cmovael %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovael %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -1013,16 +1013,16 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm3, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
@@ -1030,18 +1030,18 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
-; SSE-32-NEXT:    cmovbl %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbl %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -1165,16 +1165,16 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    movl %esp, %ebp
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
-; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
 ; SSE-32-NEXT:    movaps %xmm3, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm2, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
@@ -1182,18 +1182,18 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
-; SSE-32-NEXT:    cmovbel %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbel %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -1322,11 +1322,11 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm3, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
@@ -1334,18 +1334,18 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
-; SSE-32-NEXT:    cmovbl %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbl %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -1476,11 +1476,11 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
 ; SSE-32-NEXT:    movaps %xmm2, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    movaps %xmm3, %xmm4
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
@@ -1488,18 +1488,18 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
 ; SSE-32-NEXT:    ucomiss %xmm4, %xmm6
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm4
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm5
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
-; SSE-32-NEXT:    cmovbel %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbel %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
 ; SSE-32-NEXT:    pand %xmm5, %xmm0
@@ -1905,18 +1905,18 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    cmoval %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmoval %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2025,18 +2025,18 @@ define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    cmovael %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovael %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2145,18 +2145,18 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    cmoval %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmoval %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2263,18 +2263,18 @@ define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    cmovael %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    cmovael %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovael %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2674,18 +2674,18 @@ define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    cmovbl %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbl %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2792,18 +2792,18 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm2, %xmm4
-; SSE-32-NEXT:    cmovbel %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbel %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -2910,18 +2910,18 @@ define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    cmovbl %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    cmovbl %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbl %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
@@ -3030,18 +3030,18 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movapd 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    cmovbel %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-32-NEXT:    ucomisd %xmm4, %xmm2
-; SSE-32-NEXT:    cmovbel %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmovbel %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
index 2eeee032c1c6e..4baef0f5880d2 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll
@@ -16,17 +16,17 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; SSE-32-NEXT:    andl $-16, %esp
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm4
-; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    movl $-1, %eax
+; SSE-32-NEXT:    xorl %ecx, %ecx
 ; SSE-32-NEXT:    comiss %xmm4, %xmm2
-; SSE-32-NEXT:    movl $-1, %ecx
 ; SSE-32-NEXT:    movl $0, %edx
-; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    cmoval %eax, %edx
 ; SSE-32-NEXT:    movd %edx, %xmm3
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
 ; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-32-NEXT:    comiss %xmm4, %xmm2
-; SSE-32-NEXT:    cmoval %ecx, %eax
-; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    cmoval %eax, %ecx
+; SSE-32-NEXT:    movd %ecx, %xmm2
 ; SSE-32-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE-32-NEXT:    pand %xmm3, %xmm0
 ; SSE-32-NEXT:    pandn %xmm1, %xmm3
@@ -61,14 +61,14 @@ define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX-32-NEXT:    andl $-16, %esp
 ; AVX-32-NEXT:    subl $16, %esp
 ; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX-32-NEXT:    xorl %eax, %eax
+; AVX-32-NEXT:    movl $-1, %eax
+; AVX-32-NEXT:    xorl %ecx, %ecx
 ; AVX-32-NEXT:    vcomiss 12(%ebp), %xmm3
-; AVX-32-NEXT:    movl $-1, %ecx
 ; AVX-32-NEXT:    movl $0, %edx
-; AVX-32-NEXT:    cmoval %ecx, %edx
+; AVX-32-NEXT:    cmoval %eax, %edx
 ; AVX-32-NEXT:    vcomiss 8(%ebp), %xmm2
-; AVX-32-NEXT:    cmoval %ecx, %eax
-; AVX-32-NEXT:    vmovd %eax, %xmm2
+; AVX-32-NEXT:    cmoval %eax, %ecx
+; AVX-32-NEXT:    vmovd %ecx, %xmm2
 ; AVX-32-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
 ; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-32-NEXT:    movl %ebp, %esp
@@ -192,8 +192,8 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; SSE-32-NEXT:    subl $16, %esp
 ; SSE-32-NEXT:    movaps 8(%ebp), %xmm4
 ; SSE-32-NEXT:    xorl %eax, %eax
-; SSE-32-NEXT:    ucomiss %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm2
 ; SSE-32-NEXT:    movl $-1, %edx
 ; SSE-32-NEXT:    cmovnel %eax, %edx
 ; SSE-32-NEXT:    cmovpl %eax, %edx
@@ -241,8 +241,8 @@ define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1,
 ; AVX-32-NEXT:    subl $16, %esp
 ; AVX-32-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX-32-NEXT:    xorl %eax, %eax
-; AVX-32-NEXT:    vucomiss 12(%ebp), %xmm3
 ; AVX-32-NEXT:    movl $-1, %ecx
+; AVX-32-NEXT:    vucomiss 12(%ebp), %xmm3
 ; AVX-32-NEXT:    movl $-1, %edx
 ; AVX-32-NEXT:    cmovnel %eax, %edx
 ; AVX-32-NEXT:    cmovpl %eax, %edx
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index 48a0b27a207f3..85645cd2342d0 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -577,8 +577,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -612,8 +612,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512F-32-NEXT:    andl $-8, %esp
 ; AVX512F-32-NEXT:    subl $16, %esp
-; AVX512F-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512F-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -647,8 +647,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512VL-32-NEXT:    andl $-8, %esp
 ; AVX512VL-32-NEXT:    subl $16, %esp
-; AVX512VL-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -753,8 +753,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX-32-NEXT:    subl $16, %esp
 ; AVX-32-NEXT:    movl 8(%ebp), %eax
 ; AVX-32-NEXT:    vmovaps (%eax), %xmm0
-; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -789,8 +789,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX512F-32-NEXT:    subl $16, %esp
 ; AVX512F-32-NEXT:    movl 8(%ebp), %eax
 ; AVX512F-32-NEXT:    vmovdqa (%eax), %xmm0
-; AVX512F-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512F-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512F-32-NEXT:    flds (%esp)
 ; AVX512F-32-NEXT:    fisttpll (%esp)
 ; AVX512F-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -825,8 +825,8 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX512VL-32-NEXT:    subl $16, %esp
 ; AVX512VL-32-NEXT:    movl 8(%ebp), %eax
 ; AVX512VL-32-NEXT:    vmovdqa (%eax), %xmm0
-; AVX512VL-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT:    vmovd %xmm0, (%esp)
 ; AVX512VL-32-NEXT:    flds (%esp)
 ; AVX512VL-32-NEXT:    fisttpll (%esp)
 ; AVX512VL-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -1561,9 +1561,9 @@ define <2 x i32> @strict_vector_fptosi_v2f64_to_v2i32(<2 x double> %a) #0 {
 define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
 ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32:
 ; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    xorpd %xmm2, %xmm2
 ; SSE-32-NEXT:    movsd {{.*#+}} xmm3 = [2.147483648E+9,0.0E+0]
 ; SSE-32-NEXT:    comisd %xmm3, %xmm0
-; SSE-32-NEXT:    xorpd %xmm2, %xmm2
 ; SSE-32-NEXT:    xorpd %xmm1, %xmm1
 ; SSE-32-NEXT:    jb .LBB7_2
 ; SSE-32-NEXT:  # %bb.1:
@@ -1717,9 +1717,9 @@ define <2 x i32> @strict_vector_fptosi_v2f32_to_v2i32(<2 x float> %a) #0 {
 define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32:
 ; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    xorps %xmm2, %xmm2
 ; SSE-32-NEXT:    movss {{.*#+}} xmm3 = [2.14748365E+9,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-32-NEXT:    comiss %xmm3, %xmm0
-; SSE-32-NEXT:    xorps %xmm2, %xmm2
 ; SSE-32-NEXT:    xorps %xmm1, %xmm1
 ; SSE-32-NEXT:    jb .LBB9_2
 ; SSE-32-NEXT:  # %bb.1:
@@ -1770,8 +1770,8 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -2659,8 +2659,8 @@ define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-32-NEXT:    andl $-8, %esp
 ; AVX-32-NEXT:    subl $16, %esp
-; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
+; AVX-32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-32-NEXT:    flds (%esp)
 ; AVX-32-NEXT:    fisttpll (%esp)
 ; AVX-32-NEXT:    flds {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index cd4ceca6716b1..9056858193ea5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -272,9 +272,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT:    andl $-8, %esp
 ; SSE-32-NEXT:    subl $24, %esp
+; SSE-32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; SSE-32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; SSE-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; SSE-32-NEXT:    movd %xmm1, %eax
 ; SSE-32-NEXT:    shrl $31, %eax
@@ -341,9 +341,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; SSE41-32-NEXT:    .cfi_def_cfa_register %ebp
 ; SSE41-32-NEXT:    andl $-8, %esp
 ; SSE41-32-NEXT:    subl $24, %esp
+; SSE41-32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; SSE41-32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; SSE41-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
 ; SSE41-32-NEXT:    movd %xmm1, %eax
 ; SSE41-32-NEXT:    shrl $31, %eax
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index 7e446d5366387..e440d447abf1d 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -34,17 +34,29 @@ declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>,
 declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata)
 
 define <8 x float> @sitofp_v8i1_v8f32(<8 x i1> %x) #0 {
-; AVX1-LABEL: sitofp_v8i1_v8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; AVX1-32-LABEL: sitofp_v8i1_v8f32:
+; AVX1-32:       # %bb.0:
+; AVX1-32-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-32-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-32-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-32-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-32-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-32-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-32-NEXT:    retl
+;
+; AVX1-64-LABEL: sitofp_v8i1_v8f32:
+; AVX1-64:       # %bb.0:
+; AVX1-64-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-64-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-64-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-64-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX1-64-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX1-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX1-64-NEXT:    retq
 ;
 ; AVX2-LABEL: sitofp_v8i1_v8f32:
 ; AVX2:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vec3-setcc-crash.ll b/llvm/test/CodeGen/X86/vec3-setcc-crash.ll
index e80e709a93fb8..037722282ca86 100644
--- a/llvm/test/CodeGen/X86/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/X86/vec3-setcc-crash.ll
@@ -6,10 +6,10 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
 ; X86-LABEL: vec3_setcc_crash:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovdqa (%ecx), %xmm0
+; X86-NEXT:    vmovdqa (%eax), %xmm0
 ; X86-NEXT:    vptestnmd %xmm0, %xmm0, %k1
 ; X86-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpextrd $2, %xmm0, 8(%eax)
 ; X86-NEXT:    vpextrd $1, %xmm0, 4(%eax)
 ; X86-NEXT:    vmovd %xmm0, (%eax)
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index e229165be967a..40d06207bd69d 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -8,11 +8,11 @@ define <4 x i16> @func_16_32(ptr %a, ptr %b, ptr %c) nounwind {
 ; X86-LABEL: func_16_32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovdqa (%edx), %xmm0
-; X86-NEXT:    vpaddw (%ecx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vpaddw (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -36,16 +36,16 @@ define <4 x i16> @func_16_64(ptr %a, ptr %b, ptr %c) nounwind {
 ; X86-LABEL: func_16_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovaps (%edx), %ymm0
-; X86-NEXT:    vxorps (%ecx), %ymm0, %ymm0
+; X86-NEXT:    vmovaps (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vxorps (%eax), %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X86-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; X86-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; X86-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq %xmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -75,8 +75,8 @@ define <4 x i32> @func_32_64(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: func_32_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovaps (%ecx), %ymm0
+; X86-NEXT:    vmovaps (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vorps (%eax), %ymm0, %ymm0
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -103,8 +103,8 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: func_8_16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -129,8 +129,8 @@ define <4 x i8> @func_8_32(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: func_8_32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovdqa (%ecx), %xmm0
+; X86-NEXT:    vmovdqa (%eax), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vpsubb (%eax), %xmm0, %xmm0
 ; X86-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X86-NEXT:    retl
@@ -153,13 +153,13 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
 ; X86-LABEL: func_8_64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovdqa (%ecx), %xmm0
-; X86-NEXT:    vmovdqa 16(%ecx), %xmm1
+; X86-NEXT:    vmovdqa (%eax), %xmm0
+; X86-NEXT:    vmovdqa 16(%eax), %xmm1
 ; X86-NEXT:    vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
 ; X86-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; X86-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqa (%eax), %xmm1
 ; X86-NEXT:    vmovdqa 16(%eax), %xmm3
 ; X86-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index 4b70933334fb7..aa8bd1ce5f630 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -116,10 +116,10 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
 ; X86-LABEL: legal_vzmovl_2i32_8i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -143,8 +143,8 @@ define void @legal_vzmovl_2i64_4i64(ptr %in, ptr %out) {
 ; X86-LABEL: legal_vzmovl_2i64_4i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -166,10 +166,10 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
 ; X86-LABEL: legal_vzmovl_2f32_8f32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -193,8 +193,8 @@ define void @legal_vzmovl_2f64_4f64(ptr %in, ptr %out) {
 ; X86-LABEL: legal_vzmovl_2f64_4f64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vec_extract-sse4.ll b/llvm/test/CodeGen/X86/vec_extract-sse4.ll
index 1f384861b3735..b5f7f0c29d65d 100644
--- a/llvm/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-sse4.ll
@@ -6,8 +6,8 @@ define void @t1(ptr %R, ptr %P1) nounwind {
 ; X86-LABEL: t1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -42,9 +42,9 @@ define void @t3(ptr %R, ptr %P1) nounwind {
 ; X86-LABEL: t3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 12(%eax), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %eax, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t3:
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 9bd38db3e9c4c..a27ee6a105edc 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -6,9 +6,9 @@ define void @test1(ptr %F, ptr %f) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    addss %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -56,9 +56,9 @@ define void @test3(ptr %R, ptr %P1) nounwind {
 ; X86-LABEL: test3:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movaps (%ecx), %xmm0
+; X86-NEXT:    movaps (%eax), %xmm0
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -107,36 +107,30 @@ declare <2 x double> @foo()
 define i64 @pr150117(<31 x i8> %a0) nounwind {
 ; X86-LABEL: pr150117:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $8, %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    shll $8, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    shll $16, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    shll $24, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    movd %esi, %xmm0
-; X86-NEXT:    pinsrw $2, %edx, %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $24, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movd %edx, %xmm0
+; X86-NEXT:    pinsrw $2, %eax, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    pinsrw $3, %ecx, %xmm0
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr150117:
diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll
index d0abd7d5f7512..b99d1533013e1 100644
--- a/llvm/test/CodeGen/X86/vec_fabs.ll
+++ b/llvm/test/CodeGen/X86/vec_fabs.ll
@@ -586,9 +586,9 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind {
 ;
 ; X86-AVX512VL-LABEL: fabs_v32f16:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VL-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VL-NEXT:    vpandq (%eax), %zmm0, %zmm0
 ; X86-AVX512VL-NEXT:    retl
 ;
@@ -601,9 +601,9 @@ define <32 x half> @fabs_v32f16(ptr %p) nounwind {
 ;
 ; X86-AVX512VLDQ-LABEL: fabs_v32f16:
 ; X86-AVX512VLDQ:       # %bb.0:
-; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VLDQ-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-AVX512VLDQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VLDQ-NEXT:    vpandq (%eax), %zmm0, %zmm0
 ; X86-AVX512VLDQ-NEXT:    retl
 ;
@@ -714,25 +714,25 @@ define void @PR70947(ptr %src, ptr %dst) nounwind {
 ; X86-SSE-LABEL: PR70947:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movups (%ecx), %xmm0
-; X86-SSE-NEXT:    movups 32(%ecx), %xmm1
+; X86-SSE-NEXT:    movups (%eax), %xmm0
+; X86-SSE-NEXT:    movups 32(%eax), %xmm1
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm2 = [NaN,NaN]
 ; X86-SSE-NEXT:    andps %xmm2, %xmm0
-; X86-SSE-NEXT:    andps %xmm2, %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    andps %xmm2, %xmm1
 ; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: PR70947:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
-; X86-AVX-NEXT:    vandps (%ecx), %ymm0, %ymm1
-; X86-AVX-NEXT:    vandps 32(%ecx), %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovups %ymm1, (%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, 16(%eax)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vandps 32(%eax), %xmm0, %xmm1
+; X86-AVX-NEXT:    vandps (%eax), %ymm0, %ymm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vec_fcopysign.ll b/llvm/test/CodeGen/X86/vec_fcopysign.ll
index b506692ee5a65..94a759460a096 100644
--- a/llvm/test/CodeGen/X86/vec_fcopysign.ll
+++ b/llvm/test/CodeGen/X86/vec_fcopysign.ll
@@ -124,10 +124,10 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind {
 ; X86-SSE-LABEL: fcopysign_v8f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
 ; X86-SSE-NEXT:    movaps (%eax), %xmm1
 ; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps (%eax), %xmm0
 ; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    orps %xmm1, %xmm0
 ; X86-SSE-NEXT:    retl
@@ -135,20 +135,20 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind {
 ; X86-AVX1-LABEL: fcopysign_v8f16:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0
+; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1
 ; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vorps %xmm0, %xmm1, %xmm0
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: fcopysign_v8f16:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-AVX2-NEXT:    vpand (%ecx), %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpand (%eax), %xmm0, %xmm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-AVX2-NEXT:    vpand (%eax), %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
@@ -157,8 +157,8 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind {
 ; X86-AVX512-LABEL: fcopysign_v8f16:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vmovdqa (%ecx), %xmm1
+; X86-AVX512-NEXT:    vmovdqa (%eax), %xmm1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879]
 ; X86-AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = mem ^ (xmm0 & (xmm1 ^ mem))
 ; X86-AVX512-NEXT:    retl
@@ -365,36 +365,36 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind {
 ; X86-SSE-LABEL: fcopysign_v16f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm2
-; X86-SSE-NEXT:    andnps (%ecx), %xmm2
-; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    andnps (%eax), %xmm2
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movaps (%ecx), %xmm0
 ; X86-SSE-NEXT:    andps %xmm1, %xmm0
 ; X86-SSE-NEXT:    orps %xmm2, %xmm0
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm2
-; X86-SSE-NEXT:    andnps 16(%ecx), %xmm2
-; X86-SSE-NEXT:    andps 16(%eax), %xmm1
+; X86-SSE-NEXT:    andnps 16(%eax), %xmm2
+; X86-SSE-NEXT:    andps 16(%ecx), %xmm1
 ; X86-SSE-NEXT:    orps %xmm2, %xmm1
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: fcopysign_v16f16:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    vmovups (%ecx), %ymm0
+; X86-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovups (%eax), %ymm1
 ; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
-; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: fcopysign_v16f16:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-AVX2-NEXT:    vpand (%ecx), %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpand (%eax), %ymm0, %ymm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-AVX2-NEXT:    vpand (%eax), %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -403,8 +403,8 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind {
 ; X86-AVX512-LABEL: fcopysign_v16f16:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vmovdqu (%ecx), %ymm1
+; X86-AVX512-NEXT:    vmovdqu (%eax), %ymm1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879]
 ; X86-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm0 & (ymm1 ^ mem))
 ; X86-AVX512-NEXT:    retl
@@ -642,60 +642,60 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind {
 ; X86-SSE-LABEL: fcopysign_v32f16:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
 ; X86-SSE-NEXT:    movaps %xmm3, %xmm1
-; X86-SSE-NEXT:    andnps (%ecx), %xmm1
-; X86-SSE-NEXT:    movaps (%eax), %xmm0
+; X86-SSE-NEXT:    andnps (%eax), %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movaps (%ecx), %xmm0
 ; X86-SSE-NEXT:    andps %xmm3, %xmm0
 ; X86-SSE-NEXT:    orps %xmm1, %xmm0
 ; X86-SSE-NEXT:    movaps %xmm3, %xmm2
-; X86-SSE-NEXT:    andnps 16(%ecx), %xmm2
-; X86-SSE-NEXT:    movaps 16(%eax), %xmm1
+; X86-SSE-NEXT:    andnps 16(%eax), %xmm2
+; X86-SSE-NEXT:    movaps 16(%ecx), %xmm1
 ; X86-SSE-NEXT:    andps %xmm3, %xmm1
 ; X86-SSE-NEXT:    orps %xmm2, %xmm1
 ; X86-SSE-NEXT:    movaps %xmm3, %xmm4
-; X86-SSE-NEXT:    andnps 32(%ecx), %xmm4
-; X86-SSE-NEXT:    movaps 32(%eax), %xmm2
+; X86-SSE-NEXT:    andnps 32(%eax), %xmm4
+; X86-SSE-NEXT:    movaps 32(%ecx), %xmm2
 ; X86-SSE-NEXT:    andps %xmm3, %xmm2
 ; X86-SSE-NEXT:    orps %xmm4, %xmm2
 ; X86-SSE-NEXT:    movaps %xmm3, %xmm4
-; X86-SSE-NEXT:    andnps 48(%ecx), %xmm4
-; X86-SSE-NEXT:    andps 48(%eax), %xmm3
+; X86-SSE-NEXT:    andnps 48(%eax), %xmm4
+; X86-SSE-NEXT:    andps 48(%ecx), %xmm3
 ; X86-SSE-NEXT:    orps %xmm4, %xmm3
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: fcopysign_v32f16:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X86-AVX1-NEXT:    vandnps (%ecx), %ymm1, %ymm0
-; X86-AVX1-NEXT:    vandps (%eax), %ymm1, %ymm2
+; X86-AVX1-NEXT:    vandnps (%eax), %ymm1, %ymm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vandps (%ecx), %ymm1, %ymm2
 ; X86-AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
-; X86-AVX1-NEXT:    vandnps 32(%ecx), %ymm1, %ymm2
-; X86-AVX1-NEXT:    vandps 32(%eax), %ymm1, %ymm1
+; X86-AVX1-NEXT:    vandnps 32(%eax), %ymm1, %ymm2
+; X86-AVX1-NEXT:    vandps 32(%ecx), %ymm1, %ymm1
 ; X86-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: fcopysign_v32f16:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; X86-AVX2-NEXT:    vpandn (%ecx), %ymm1, %ymm0
-; X86-AVX2-NEXT:    vpand (%eax), %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpandn (%eax), %ymm1, %ymm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT:    vpand (%ecx), %ymm1, %ymm2
 ; X86-AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vpandn 32(%ecx), %ymm1, %ymm2
-; X86-AVX2-NEXT:    vpand 32(%eax), %ymm1, %ymm1
+; X86-AVX2-NEXT:    vpandn 32(%eax), %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpand 32(%ecx), %ymm1, %ymm1
 ; X86-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: fcopysign_v32f16:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    vmovdqu64 (%ecx), %zmm1
+; X86-AVX512-NEXT:    vmovdqu64 (%eax), %zmm1
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879]
 ; X86-AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem))
 ; X86-AVX512-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll
index 64204a5c2123f..51f95ebd33fb9 100644
--- a/llvm/test/CodeGen/X86/vec_fneg.ll
+++ b/llvm/test/CodeGen/X86/vec_fneg.ll
@@ -578,9 +578,9 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind {
 ;
 ; X86-AVX512VL-LABEL: fneg_v32f16:
 ; X86-AVX512VL:       # %bb.0:
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VL-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X86-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VL-NEXT:    vpxorq (%eax), %zmm0, %zmm0
 ; X86-AVX512VL-NEXT:    retl
 ;
@@ -593,9 +593,9 @@ define <32 x half> @fneg_v32f16(ptr %p) nounwind {
 ;
 ; X86-AVX512VLDQ-LABEL: fneg_v32f16:
 ; X86-AVX512VLDQ:       # %bb.0:
-; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VLDQ-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X86-AVX512VLDQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VLDQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512VLDQ-NEXT:    vpxorq (%eax), %zmm0, %zmm0
 ; X86-AVX512VLDQ-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll
index 29c77bf6ae312..4f8271b75aca6 100644
--- a/llvm/test/CodeGen/X86/vec_fpext.ll
+++ b/llvm/test/CodeGen/X86/vec_fpext.ll
@@ -78,25 +78,25 @@ define <4 x double> @fpext_8f32_to_4f64(<8 x float> %a) {
 define void @fpext_frommem(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fpext_frommem:
 ; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    cvtps2pd (%eax), %xmm0 # encoding: [0x0f,0x5a,0x00]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
 ; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX-LABEL: fpext_frommem:
 ; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vcvtps2pd (%eax), %xmm0 # encoding: [0xc5,0xf8,0x5a,0x00]
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX-NEXT:    vcvtps2pd (%ecx), %xmm0 # encoding: [0xc5,0xf8,0x5a,0x01]
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: fpext_frommem:
 ; X86-AVX512VL:       # %bb.0: # %entry
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vcvtps2pd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x00]
 ; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX512VL-NEXT:    vcvtps2pd (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x01]
 ; X86-AVX512VL-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
@@ -127,28 +127,28 @@ entry:
 define void @fpext_frommem4(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fpext_frommem4:
 ; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    cvtps2pd (%eax), %xmm0 # encoding: [0x0f,0x5a,0x00]
+; X86-SSE-NEXT:    cvtps2pd 8(%eax), %xmm1 # encoding: [0x0f,0x5a,0x48,0x08]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
-; X86-SSE-NEXT:    cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08]
 ; X86-SSE-NEXT:    movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10]
 ; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX-LABEL: fpext_frommem4:
 ; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vcvtps2pd (%eax), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x00]
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX-NEXT:    vcvtps2pd (%ecx), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x01]
 ; X86-AVX-NEXT:    vmovups %ymm0, (%eax) # encoding: [0xc5,0xfc,0x11,0x00]
 ; X86-AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: fpext_frommem4:
 ; X86-AVX512VL:       # %bb.0: # %entry
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vcvtps2pd (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x00]
 ; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX512VL-NEXT:    vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01]
 ; X86-AVX512VL-NEXT:    vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00]
 ; X86-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
@@ -184,12 +184,12 @@ entry:
 define void @fpext_frommem8(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fpext_frommem8:
 ; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    cvtps2pd 8(%eax), %xmm0 # encoding: [0x0f,0x5a,0x40,0x08]
+; X86-SSE-NEXT:    cvtps2pd (%eax), %xmm1 # encoding: [0x0f,0x5a,0x08]
+; X86-SSE-NEXT:    cvtps2pd 24(%eax), %xmm2 # encoding: [0x0f,0x5a,0x50,0x18]
+; X86-SSE-NEXT:    cvtps2pd 16(%eax), %xmm3 # encoding: [0x0f,0x5a,0x58,0x10]
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08]
-; X86-SSE-NEXT:    cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09]
-; X86-SSE-NEXT:    cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18]
-; X86-SSE-NEXT:    cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10]
 ; X86-SSE-NEXT:    movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20]
 ; X86-SSE-NEXT:    movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30]
 ; X86-SSE-NEXT:    movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08]
@@ -198,10 +198,10 @@ define void @fpext_frommem8(ptr %in, ptr %out) {
 ;
 ; X86-AVX-LABEL: fpext_frommem8:
 ; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vcvtps2pd (%eax), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x00]
+; X86-AVX-NEXT:    vcvtps2pd 16(%eax), %ymm1 # encoding: [0xc5,0xfc,0x5a,0x48,0x10]
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX-NEXT:    vcvtps2pd (%ecx), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x01]
-; X86-AVX-NEXT:    vcvtps2pd 16(%ecx), %ymm1 # encoding: [0xc5,0xfc,0x5a,0x49,0x10]
 ; X86-AVX-NEXT:    vmovups %ymm1, 32(%eax) # encoding: [0xc5,0xfc,0x11,0x48,0x20]
 ; X86-AVX-NEXT:    vmovups %ymm0, (%eax) # encoding: [0xc5,0xfc,0x11,0x00]
 ; X86-AVX-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -209,9 +209,9 @@ define void @fpext_frommem8(ptr %in, ptr %out) {
 ;
 ; X86-AVX512VL-LABEL: fpext_frommem8:
 ; X86-AVX512VL:       # %bb.0: # %entry
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vcvtps2pd (%eax), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x00]
 ; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
-; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
-; X86-AVX512VL-NEXT:    vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01]
 ; X86-AVX512VL-NEXT:    vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
 ; X86-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-AVX512VL-NEXT:    retl # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/vec_fptrunc.ll b/llvm/test/CodeGen/X86/vec_fptrunc.ll
index 5b2dafaf3ac28..ba24f3d87b122 100644
--- a/llvm/test/CodeGen/X86/vec_fptrunc.ll
+++ b/llvm/test/CodeGen/X86/vec_fptrunc.ll
@@ -8,16 +8,16 @@ define void @fptrunc_frommem2(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fptrunc_frommem2:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    cvtpd2ps (%ecx), %xmm0
+; X86-SSE-NEXT:    cvtpd2ps (%eax), %xmm0
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movlpd %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: fptrunc_frommem2:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vcvtpd2psx (%ecx), %xmm0
+; X86-AVX-NEXT:    vcvtpd2psx (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovlpd %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -43,18 +43,18 @@ define void @fptrunc_frommem4(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fptrunc_frommem4:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    cvtpd2ps 16(%ecx), %xmm0
-; X86-SSE-NEXT:    cvtpd2ps (%ecx), %xmm1
+; X86-SSE-NEXT:    cvtpd2ps 16(%eax), %xmm0
+; X86-SSE-NEXT:    cvtpd2ps (%eax), %xmm1
 ; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movupd %xmm1, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: fptrunc_frommem4:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vcvtpd2psy (%ecx), %xmm0
+; X86-AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovupd %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
@@ -82,23 +82,23 @@ define void @fptrunc_frommem8(ptr %in, ptr %out) {
 ; X86-SSE-LABEL: fptrunc_frommem8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    cvtpd2ps 16(%ecx), %xmm0
-; X86-SSE-NEXT:    cvtpd2ps (%ecx), %xmm1
+; X86-SSE-NEXT:    cvtpd2ps 48(%eax), %xmm0
+; X86-SSE-NEXT:    cvtpd2ps 32(%eax), %xmm1
 ; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X86-SSE-NEXT:    cvtpd2ps 48(%ecx), %xmm0
-; X86-SSE-NEXT:    cvtpd2ps 32(%ecx), %xmm2
+; X86-SSE-NEXT:    cvtpd2ps 16(%eax), %xmm0
+; X86-SSE-NEXT:    cvtpd2ps (%eax), %xmm2
 ; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X86-SSE-NEXT:    movupd %xmm2, 16(%eax)
-; X86-SSE-NEXT:    movupd %xmm1, (%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movupd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movupd %xmm2, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: fptrunc_frommem8:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vcvtpd2psy (%ecx), %xmm0
-; X86-AVX-NEXT:    vcvtpd2psy 32(%ecx), %xmm1
+; X86-AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; X86-AVX-NEXT:    vcvtpd2psy 32(%eax), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovupd %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovupd %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll
index cf70d5d7f1edf..a0292e177be9c 100644
--- a/llvm/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/llvm/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -12,8 +12,8 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-16, %esp
 ; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movaps %xmm0, (%esp)
+; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movl $76, (%esp,%eax,4)
 ; X32-NEXT:    movl (%esp), %eax
 ; X32-NEXT:    movl %ebp, %esp
@@ -40,10 +40,10 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-16, %esp
 ; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movl $76, %ecx
 ; X32-NEXT:    pinsrd $0, %ecx, %xmm0
 ; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movl (%esp,%eax,4), %eax
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -70,8 +70,8 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-16, %esp
 ; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    pinsrd $0, (%esp,%eax,4), %xmm0
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -96,8 +96,8 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-16, %esp
 ; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movaps %xmm0, (%esp)
+; X32-NEXT:    andl $3, %eax
 ; X32-NEXT:    movss %xmm0, (%esp,%eax,4)
 ; X32-NEXT:    movaps (%esp), %xmm0
 ; X32-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/vec_insert-4.ll b/llvm/test/CodeGen/X86/vec_insert-4.ll
index 0182391eaf84f..f77f99377816b 100644
--- a/llvm/test/CodeGen/X86/vec_insert-4.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-4.ll
@@ -6,10 +6,10 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind  {
 ; X86-LABEL: f:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $7, %eax
 ; X86-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movaps %xmm0, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $7, %eax
 ; X86-NEXT:    movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
 ; X86-NEXT:    movaps (%esp), %xmm0
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index ddde9ecb7c0fd..daf97dfeaacbc 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -8,10 +8,10 @@
 define void  @t1(i32 %a, ptr %P) nounwind {
 ; X86-LABEL: t1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; X86-NEXT:    pslld $12, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vec_insert-8.ll b/llvm/test/CodeGen/X86/vec_insert-8.ll
index aa3364b31d66f..71f473d08f0f8 100644
--- a/llvm/test/CodeGen/X86/vec_insert-8.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-8.ll
@@ -11,10 +11,10 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movl 12(%ebp), %eax
 ; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movl %ecx, (%esp,%eax,4)
 ; X86-NEXT:    movaps (%esp), %xmm0
 ; X86-NEXT:    movl %ebp, %esp
@@ -41,9 +41,9 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    movaps %xmm0, (%esp)
 ; X86-NEXT:    movl (%esp,%eax,4), %eax
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vec_logical.ll b/llvm/test/CodeGen/X86/vec_logical.ll
index 6935f7c7bfbbf..9ca68bfe4180d 100644
--- a/llvm/test/CodeGen/X86/vec_logical.ll
+++ b/llvm/test/CodeGen/X86/vec_logical.ll
@@ -58,19 +58,19 @@ entry:
 define void @t3(<4 x float> %a, <4 x float> %b, ptr %c, ptr %d) {
 ; SSE-LABEL: t3:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; SSE-NEXT:    andnps %xmm1, %xmm0
-; SSE-NEXT:    orps (%ecx), %xmm0
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    orps (%eax), %xmm0
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-NEXT:    movaps %xmm0, (%eax)
 ; SSE-NEXT:    retl
 ;
 ; AVX-LABEL: t3:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX-NEXT:    vandnps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vorps (%ecx), %xmm0, %xmm0
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vorps (%eax), %xmm0, %xmm0
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; AVX-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/vec_set.ll b/llvm/test/CodeGen/X86/vec_set.ll
index da61839299498..fdffce3b89a76 100644
--- a/llvm/test/CodeGen/X86/vec_set.ll
+++ b/llvm/test/CodeGen/X86/vec_set.ll
@@ -5,7 +5,6 @@
 define void @test(ptr %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
 ; X86-LABEL: test:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -21,6 +20,7 @@ define void @test(ptr %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5,
 ; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm3, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 2ab00ea96ada1..efc9a4a259598 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -175,8 +175,8 @@ define <2 x i64> @test16() {
 define <4 x i32> @test17(<4 x i32> %a0, ptr %dummy) {
 ; X86-LABEL: test17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -194,8 +194,8 @@ define <4 x i32> @test17(<4 x i32> %a0, ptr %dummy) {
 define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
 ; X86-LABEL: test18:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -252,17 +252,29 @@ define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
 }
 
 define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
-; CHECK-LABEL: extelt1_add_psrai_v4i32_uses:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; CHECK-NEXT:    movd %xmm1, %ecx
-; CHECK-NEXT:    addl $3, %ecx
-; CHECK-NEXT:    movd %ecx, %xmm1
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT:    psrad %xmm1, %xmm0
-; CHECK-NEXT:    movd %xmm0, %eax
-; CHECK-NEXT:    imull %ecx, %eax
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: extelt1_add_psrai_v4i32_uses:
+; X86:       # %bb.0:
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X86-NEXT:    movd %xmm1, %ecx
+; X86-NEXT:    addl $3, %ecx
+; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    psrad %xmm1, %xmm0
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: extelt1_add_psrai_v4i32_uses:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    addl $3, %ecx
+; X64-NEXT:    movd %ecx, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT:    psrad %xmm1, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    retq
   %ext = extractelement <4 x i32> %y, i64 1
   %bo = add i32 %ext, 3
   %r = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 %bo)
@@ -272,14 +284,23 @@ define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
 }
 
 define <4 x i32> @extelt0_twice_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z){
-; CHECK-LABEL: extelt0_twice_sub_pslli_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    movd %xmm2, %ecx
-; CHECK-NEXT:    subl %ecx, %eax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    pslld %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: extelt0_twice_sub_pslli_v4i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movd %xmm2, %eax
+; X86-NEXT:    movd %xmm1, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: extelt0_twice_sub_pslli_v4i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    pslld %xmm1, %xmm0
+; X64-NEXT:    retq
   %ext1 = extractelement <4 x i32> %y, i64 0
   %ext2 = extractelement <4 x i32> %z, i64 0
   %bo = sub i32 %ext1, %ext2
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e73d345d0fcd4..9785b5f8291f1 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -375,8 +375,8 @@ define <4 x float> @double_fold(ptr %x, <4 x float> %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    movaps %xmm0, %xmm2
-; X86-NEXT:    minss %xmm1, %xmm2
-; X86-NEXT:    maxss %xmm1, %xmm0
+; X86-NEXT:    maxss %xmm1, %xmm2
+; X86-NEXT:    minss %xmm1, %xmm0
 ; X86-NEXT:    addps %xmm2, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -393,9 +393,9 @@ define <4 x float> @double_fold(ptr %x, <4 x float> %y) {
 ; X86_AVX:       ## %bb.0: ## %entry
 ; X86_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86_AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm2
-; X86_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
-; X86_AVX-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; X86_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm2
+; X86_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm0
+; X86_AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; X86_AVX-NEXT:    retl
 ;
 ; X64_AVX-LABEL: double_fold:
diff --git a/llvm/test/CodeGen/X86/vec_zero.ll b/llvm/test/CodeGen/X86/vec_zero.ll
index 6e798e8af53f6..85ddb0c4992c2 100644
--- a/llvm/test/CodeGen/X86/vec_zero.ll
+++ b/llvm/test/CodeGen/X86/vec_zero.ll
@@ -49,8 +49,8 @@ define void @bar(ptr %P) {
 define void @untyped_zero(ptr %p) {
 ; X86-LABEL: untyped_zero:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
index d4357aeb2e1de..5c33e56c78ee6 100644
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -13,10 +13,10 @@
 define void @test1() {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $0, M1+4
-; X86-NEXT:    movl $0, M1
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movsd %xmm0, M2
+; X86-NEXT:    movl $0, M1+4
+; X86-NEXT:    movl $0, M1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test1:
@@ -32,10 +32,10 @@ define void @test1() {
 define void @test2() {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $-1, M1+4
-; X86-NEXT:    movl $-1, M1
 ; X86-NEXT:    pcmpeqd %xmm0, %xmm0
 ; X86-NEXT:    movq %xmm0, M2
+; X86-NEXT:    movl $-1, M1+4
+; X86-NEXT:    movl $-1, M1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test2:
@@ -52,8 +52,8 @@ define void @test3() {
 ; X86-LABEL: test3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movaps %xmm0, S1
 ; X86-NEXT:    movaps %xmm0, S2
+; X86-NEXT:    movaps %xmm0, S1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test3:
@@ -71,8 +71,8 @@ define void @test4() {
 ; X86-LABEL: test4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pcmpeqd %xmm0, %xmm0
-; X86-NEXT:    movdqa %xmm0, S1
 ; X86-NEXT:    movdqa %xmm0, S2
+; X86-NEXT:    movdqa %xmm0, S1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test4:
diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
index 889ab6a0818e2..2ebb04ae9e842 100644
--- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
@@ -11,26 +11,26 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $272, %esp # imm = 0x110
+; X86-SSE-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movdqa 72(%ebp), %xmm1
+; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; X86-SSE-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    movl 88(%ebp), %ecx
-; X86-SSE-NEXT:    movdqa 72(%ebp), %xmm0
-; X86-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSE-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    movaps %xmm1, (%esp)
-; X86-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT:    leal (%ecx,%ecx), %eax
 ; X86-SSE-NEXT:    andl $31, %eax
 ; X86-SSE-NEXT:    movl 128(%esp,%eax,4), %eax
@@ -66,17 +66,17 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X86-AVX-NEXT:    movl %esp, %ebp
 ; X86-AVX-NEXT:    andl $-32, %esp
 ; X86-AVX-NEXT:    subl $288, %esp # imm = 0x120
-; X86-AVX-NEXT:    movl 40(%ebp), %ecx
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %ymm1, (%esp)
 ; X86-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovaps %ymm0, (%esp)
+; X86-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 40(%ebp), %ecx
 ; X86-AVX-NEXT:    leal (%ecx,%ecx), %eax
 ; X86-AVX-NEXT:    andl $31, %eax
 ; X86-AVX-NEXT:    movl 128(%esp,%eax,4), %eax
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index fe36fbf626d70..f3d468955aceb 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1447,9 +1447,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
 ; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE2-NEXT:    psllw %xmm2, %xmm3
 ; X86-SSE2-NEXT:    psrlw $8, %xmm3
@@ -1661,24 +1661,24 @@ define void @sink_splatvar(ptr %p, i32 %shift_amt) {
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-SSE2-NEXT:    .cfi_offset %esi, -8
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    xorl %ecx, %ecx
+; X86-SSE2-NEXT:    xorl %eax, %eax
 ; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    xorl %edx, %edx
 ; X86-SSE2-NEXT:    .p2align 4
 ; X86-SSE2-NEXT:  .LBB8_1: # %loop
 ; X86-SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-SSE2-NEXT:    movdqu (%eax,%ecx,4), %xmm1
+; X86-SSE2-NEXT:    movdqu (%ecx,%eax,4), %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
 ; X86-SSE2-NEXT:    psllq %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; X86-SSE2-NEXT:    psllq %xmm0, %xmm1
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; X86-SSE2-NEXT:    movups %xmm1, (%eax,%ecx,4)
-; X86-SSE2-NEXT:    addl $4, %ecx
+; X86-SSE2-NEXT:    movups %xmm1, (%ecx,%eax,4)
+; X86-SSE2-NEXT:    addl $4, %eax
 ; X86-SSE2-NEXT:    adcl $0, %edx
-; X86-SSE2-NEXT:    movl %ecx, %esi
+; X86-SSE2-NEXT:    movl %eax, %esi
 ; X86-SSE2-NEXT:    xorl $256, %esi # imm = 0x100
 ; X86-SSE2-NEXT:    orl %edx, %esi
 ; X86-SSE2-NEXT:    jne .LBB8_1
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index c6698be9bc15c..9b8a9a02ffb33 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1186,9 +1186,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    psllw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    psrlw $8, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 0462d0cd6a7af..09424818aa339 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1592,9 +1592,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE2-NEXT:    psrlw %xmm2, %xmm4
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 4c271a5ee797c..2a17f7aaa217a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1249,9 +1249,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind
 ;
 ; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    psrlw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll
index b4cffcd171b33..fcda7126d9064 100644
--- a/llvm/test/CodeGen/X86/vector-gep.ll
+++ b/llvm/test/CodeGen/X86/vector-gep.ll
@@ -121,88 +121,70 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind {
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-32, %esp
-; CHECK-NEXT:    subl $160, %esp
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm3
-; CHECK-NEXT:    vbroadcastss 12(%ebp), %xmm5
-; CHECK-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
-; CHECK-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vpaddd %xmm2, %xmm2, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 40(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 56(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 72(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa %xmm0, (%esp) # 16-byte Spill
-; CHECK-NEXT:    vmovdqa 88(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm2
-; CHECK-NEXT:    vmovdqa 104(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm1
-; CHECK-NEXT:    vmovdqa 120(%ebp), %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddd %xmm0, %xmm5, %xmm0
-; CHECK-NEXT:    vmovdqa 136(%ebp), %xmm6
-; CHECK-NEXT:    vpaddd %xmm6, %xmm6, %xmm6
-; CHECK-NEXT:    vpaddd %xmm6, %xmm5, %xmm6
-; CHECK-NEXT:    vmovdqa 152(%ebp), %xmm7
-; CHECK-NEXT:    vpaddd %xmm7, %xmm7, %xmm7
-; CHECK-NEXT:    vpaddd %xmm7, %xmm5, %xmm7
+; CHECK-NEXT:    subl $32, %esp
 ; CHECK-NEXT:    vmovdqa 168(%ebp), %xmm4
-; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; CHECK-NEXT:    vmovdqa 184(%ebp), %xmm3
-; CHECK-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; CHECK-NEXT:    vpaddd %xmm3, %xmm3, %xmm5
+; CHECK-NEXT:    vbroadcastss 12(%ebp), %xmm3
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
 ; CHECK-NEXT:    movl 8(%ebp), %eax
-; CHECK-NEXT:    vmovdqa %xmm3, 240(%eax)
+; CHECK-NEXT:    vmovdqa %xmm5, 240(%eax)
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
 ; CHECK-NEXT:    vmovdqa %xmm4, 224(%eax)
-; CHECK-NEXT:    vmovdqa %xmm7, 208(%eax)
-; CHECK-NEXT:    vmovdqa %xmm6, 192(%eax)
-; CHECK-NEXT:    vmovdqa %xmm0, 176(%eax)
-; CHECK-NEXT:    vmovdqa %xmm1, 160(%eax)
-; CHECK-NEXT:    vmovdqa %xmm2, 144(%eax)
-; CHECK-NEXT:    vmovaps (%esp), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 128(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 112(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 96(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 80(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 64(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 48(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 32(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, 16(%eax)
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vmovaps %xmm0, (%eax)
+; CHECK-NEXT:    vmovdqa 136(%ebp), %xmm4
+; CHECK-NEXT:    vmovdqa 152(%ebp), %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vmovdqa %xmm5, 208(%eax)
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm4, 192(%eax)
+; CHECK-NEXT:    vmovdqa 104(%ebp), %xmm4
+; CHECK-NEXT:    vmovdqa 120(%ebp), %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vmovdqa %xmm5, 176(%eax)
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm4, 160(%eax)
+; CHECK-NEXT:    vmovdqa 72(%ebp), %xmm4
+; CHECK-NEXT:    vmovdqa 88(%ebp), %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vmovdqa %xmm5, 144(%eax)
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm4, 128(%eax)
+; CHECK-NEXT:    vmovdqa 40(%ebp), %xmm4
+; CHECK-NEXT:    vmovdqa 56(%ebp), %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm5, %xmm5
+; CHECK-NEXT:    vpaddd %xmm5, %xmm3, %xmm5
+; CHECK-NEXT:    vmovdqa %xmm5, 112(%eax)
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm4, 96(%eax)
+; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vpaddd %xmm4, %xmm3, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm4, 80(%eax)
+; CHECK-NEXT:    vpaddd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmovdqa %xmm2, 64(%eax)
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmovdqa %xmm2, 48(%eax)
+; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vmovdqa %xmm1, 32(%eax)
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vmovdqa %xmm1, 16(%eax)
+; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vmovdqa %xmm0, (%eax)
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
index 1d9977e6d6287..ed40441525a5e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -24,7 +24,6 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_udiv7_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
 ; X86-NEXT:    movdqa %xmm0, %xmm2
@@ -39,6 +38,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-NEXT:    psrld $1, %xmm0
 ; X86-NEXT:    paddd %xmm2, %xmm0
 ; X86-NEXT:    psrld $2, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -74,7 +74,6 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_urem7_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
 ; X86-NEXT:    movdqa %xmm0, %xmm2
@@ -94,6 +93,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-NEXT:    pslld $3, %xmm2
 ; X86-NEXT:    psubd %xmm2, %xmm1
 ; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -130,7 +130,6 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_sdiv7_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
 ; X86-NEXT:    movdqa %xmm1, %xmm0
@@ -151,6 +150,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-NEXT:    psrld $31, %xmm1
 ; X86-NEXT:    psrad $2, %xmm0
 ; X86-NEXT:    paddd %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -191,7 +191,6 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_srem7_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
 ; X86-NEXT:    movdqa %xmm0, %xmm1
@@ -216,6 +215,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-NEXT:    pslld $3, %xmm2
 ; X86-NEXT:    psubd %xmm2, %xmm1
 ; X86-NEXT:    paddd %xmm0, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -235,9 +235,9 @@ define void @test_udiv_pow2_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_udiv_pow2_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -257,9 +257,9 @@ define void @test_urem_pow2_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_urem_pow2_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movlps %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -283,13 +283,13 @@ define void @test_sdiv_pow2_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_sdiv_pow2_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrad $31, %xmm1
 ; X86-NEXT:    psrld $29, %xmm1
 ; X86-NEXT:    paddd %xmm0, %xmm1
 ; X86-NEXT:    psrad $3, %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -309,9 +309,9 @@ define void @test_srem_pow2_v2i32(ptr %x, ptr %y) nounwind {
 ; X86-LABEL: test_srem_pow2_v2i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    psrld $3, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
@@ -345,27 +345,25 @@ define void @test_udiv_v2i32(ptr %x, ptr %y, ptr %z) nounwind {
 ;
 ; X86-LABEL: test_udiv_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT:    movq %xmm2, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
   %b = load <2 x i32>, ptr %y
@@ -399,27 +397,25 @@ define void @test_urem_v2i32(ptr %x, ptr %y, ptr %z) nounwind {
 ;
 ; X86-LABEL: test_urem_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %edx, %xmm2
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    divl %esi
+; X86-NEXT:    divl %ecx
 ; X86-NEXT:    movd %edx, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT:    movq %xmm2, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
   %b = load <2 x i32>, ptr %y
@@ -453,27 +449,25 @@ define void @test_sdiv_v2i32(ptr %x, ptr %y, ptr %z) nounwind {
 ;
 ; X86-LABEL: test_sdiv_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT:    movq %xmm2, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
   %b = load <2 x i32>, ptr %y
@@ -507,27 +501,25 @@ define void @test_srem_v2i32(ptr %x, ptr %y, ptr %z) nounwind {
 ;
 ; X86-LABEL: test_srem_v2i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    movd %xmm0, %eax
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm2
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X86-NEXT:    movd %xmm0, %eax
 ; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT:    movd %xmm1, %esi
+; X86-NEXT:    movd %xmm1, %ecx
 ; X86-NEXT:    cltd
-; X86-NEXT:    idivl %esi
+; X86-NEXT:    idivl %ecx
 ; X86-NEXT:    movd %eax, %xmm0
 ; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X86-NEXT:    movq %xmm2, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
   %a = load <2 x i32>, ptr %x
   %b = load <2 x i32>, ptr %y
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 963816c4e0e2e..e8b912ad59f53 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -23,7 +23,8 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) nounwind  {
 ;
 ; X86-NOX87-LABEL: llrint_v1i64_v1f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -52,54 +53,43 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    flds 16(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    flds 12(%ebp)
 ; X86-NEXT:    fistpll (%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl (%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: llrint_v2i64_v2f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl %edi, (%esi)
+; X86-NOX87-NEXT:    calll llrintf
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    popl %esi
-; X86-NOX87-NEXT:    popl %edi
-; X86-NOX87-NEXT:    popl %ebx
-; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl $4
 ;
 ; SSE-LABEL: llrint_v2i64_v2f32:
@@ -137,46 +127,34 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    flds 24(%ebp)
-; X86-NEXT:    flds 20(%ebp)
-; X86-NEXT:    flds 16(%ebp)
-; X86-NEXT:    flds 12(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    flds 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, 20(%eax)
-; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    flds 16(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 12(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -186,41 +164,35 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $12, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    subl $8, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, %edi
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, %ebx
 ; X86-NOX87-NEXT:    movl %edx, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 28(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 24(%esi)
 ; X86-NOX87-NEXT:    movl %ebp, 20(%esi)
 ; X86-NOX87-NEXT:    movl %ebx, 16(%esi)
 ; X86-NOX87-NEXT:    movl %edi, 12(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    calll llrintf
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $12, %esp
+; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -297,86 +269,58 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $120, %esp
-; X86-NEXT:    flds 12(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 16(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 20(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 24(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 28(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 32(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 36(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    flds 40(%ebp)
 ; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, 60(%eax)
+; X86-NEXT:    movl %edx, 60(%eax)
 ; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    flds 36(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, 52(%eax)
-; X86-NEXT:    movl %esi, 48(%eax)
-; X86-NEXT:    movl %edi, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    flds 32(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 44(%eax)
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 28(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 36(%eax)
 ; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 24(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
 ; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 16(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 12(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -386,78 +330,71 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $44, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    subl $40, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %edx, %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, %ebp
+; X86-NOX87-NEXT:    movl %edx, %esi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %edx, 60(%esi)
-; X86-NOX87-NEXT:    movl %eax, 56(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 52(%esi)
-; X86-NOX87-NEXT:    movl %edi, 48(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 44(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 40(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 36(%esi)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOX87-NEXT:    movl %edx, 60(%ebx)
+; X86-NOX87-NEXT:    movl %eax, 56(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 52(%ebx)
+; X86-NOX87-NEXT:    movl %ebp, 48(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 44(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 32(%esi)
+; X86-NOX87-NEXT:    movl %eax, 40(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 28(%esi)
+; X86-NOX87-NEXT:    movl %eax, 36(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 24(%esi)
+; X86-NOX87-NEXT:    movl %eax, 32(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 20(%esi)
+; X86-NOX87-NEXT:    movl %eax, 28(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 16(%esi)
+; X86-NOX87-NEXT:    movl %eax, 24(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
+; X86-NOX87-NEXT:    movl %eax, 20(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    movl %eax, 16(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $44, %esp
+; X86-NOX87-NEXT:    movl %eax, 8(%ebx)
+; X86-NOX87-NEXT:    calll llrintf
+; X86-NOX87-NEXT:    movl %edx, 4(%ebx)
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
+; X86-NOX87-NEXT:    addl $40, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -582,166 +519,106 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $248, %esp
-; X86-NEXT:    flds 12(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 16(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 20(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 24(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 28(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 32(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 36(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 40(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 44(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 48(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 52(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 56(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 60(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 64(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    flds 68(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    subl $128, %esp
 ; X86-NEXT:    flds 72(%ebp)
 ; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, 124(%eax)
+; X86-NEXT:    movl %edx, 124(%eax)
 ; X86-NEXT:    movl %ecx, 120(%eax)
+; X86-NEXT:    flds 68(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, 116(%eax)
-; X86-NEXT:    movl %esi, 112(%eax)
-; X86-NEXT:    movl %edi, 108(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 112(%eax)
+; X86-NEXT:    flds 64(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 108(%eax)
 ; X86-NEXT:    movl %ecx, 104(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 100(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 60(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 100(%eax)
 ; X86-NEXT:    movl %ecx, 96(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 92(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 56(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 92(%eax)
 ; X86-NEXT:    movl %ecx, 88(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 84(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 52(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 84(%eax)
 ; X86-NEXT:    movl %ecx, 80(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 76(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 48(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 76(%eax)
 ; X86-NEXT:    movl %ecx, 72(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 68(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 44(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 68(%eax)
 ; X86-NEXT:    movl %ecx, 64(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 40(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 60(%eax)
 ; X86-NEXT:    movl %ecx, 56(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 52(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 36(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 52(%eax)
 ; X86-NEXT:    movl %ecx, 48(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 32(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 44(%eax)
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 28(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 36(%eax)
 ; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 24(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
 ; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 16(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    flds 12(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -751,150 +628,143 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $108, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    subl $104, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %edx, %ebx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NOX87-NEXT:    movl %eax, %esi
 ; X86-NOX87-NEXT:    movl %edx, %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    movl %edx, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll llrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %edx, 124(%esi)
-; X86-NOX87-NEXT:    movl %eax, 120(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 116(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 112(%esi)
-; X86-NOX87-NEXT:    movl %edi, 108(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 104(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 100(%esi)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NOX87-NEXT:    movl %edx, 124(%ebp)
+; X86-NOX87-NEXT:    movl %eax, 120(%ebp)
+; X86-NOX87-NEXT:    movl %edi, 116(%ebp)
+; X86-NOX87-NEXT:    movl %esi, 112(%ebp)
+; X86-NOX87-NEXT:    movl %ebx, 108(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 96(%esi)
+; X86-NOX87-NEXT:    movl %eax, 104(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 92(%esi)
+; X86-NOX87-NEXT:    movl %eax, 100(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 88(%esi)
+; X86-NOX87-NEXT:    movl %eax, 96(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 84(%esi)
+; X86-NOX87-NEXT:    movl %eax, 92(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 80(%esi)
+; X86-NOX87-NEXT:    movl %eax, 88(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 76(%esi)
+; X86-NOX87-NEXT:    movl %eax, 84(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 72(%esi)
+; X86-NOX87-NEXT:    movl %eax, 80(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 68(%esi)
+; X86-NOX87-NEXT:    movl %eax, 76(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 64(%esi)
+; X86-NOX87-NEXT:    movl %eax, 72(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 60(%esi)
+; X86-NOX87-NEXT:    movl %eax, 68(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 56(%esi)
+; X86-NOX87-NEXT:    movl %eax, 64(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 52(%esi)
+; X86-NOX87-NEXT:    movl %eax, 60(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 48(%esi)
+; X86-NOX87-NEXT:    movl %eax, 56(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 44(%esi)
+; X86-NOX87-NEXT:    movl %eax, 52(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 40(%esi)
+; X86-NOX87-NEXT:    movl %eax, 48(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 36(%esi)
+; X86-NOX87-NEXT:    movl %eax, 44(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 32(%esi)
+; X86-NOX87-NEXT:    movl %eax, 40(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 28(%esi)
+; X86-NOX87-NEXT:    movl %eax, 36(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 24(%esi)
+; X86-NOX87-NEXT:    movl %eax, 32(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 20(%esi)
+; X86-NOX87-NEXT:    movl %eax, 28(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 16(%esi)
+; X86-NOX87-NEXT:    movl %eax, 24(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
+; X86-NOX87-NEXT:    movl %eax, 20(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    movl %eax, 16(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, 12(%ebp)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $108, %esp
+; X86-NOX87-NEXT:    movl %eax, 8(%ebp)
+; X86-NOX87-NEXT:    calll llrintf
+; X86-NOX87-NEXT:    movl %edx, 4(%ebp)
+; X86-NOX87-NEXT:    movl %eax, (%ebp)
+; X86-NOX87-NEXT:    movl %ebp, %eax
+; X86-NOX87-NEXT:    addl $104, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -1133,8 +1003,10 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) nounwind {
 ;
 ; X86-NOX87-LABEL: llrint_v1i64_v1f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -1163,56 +1035,47 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    fldl 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    fldl 12(%ebp)
 ; X86-NEXT:    fistpll (%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl (%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: llrint_v2i64_v2f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    subl $8, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl %edi, (%esi)
+; X86-NOX87-NEXT:    calll llrint
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    popl %esi
-; X86-NOX87-NEXT:    popl %edi
-; X86-NOX87-NEXT:    popl %ebx
-; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl $4
 ;
 ; SSE-LABEL: llrint_v2i64_v2f64:
@@ -1250,46 +1113,34 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    subl $32, %esp
 ; X86-NEXT:    fldl 36(%ebp)
-; X86-NEXT:    fldl 28(%ebp)
-; X86-NEXT:    fldl 20(%ebp)
-; X86-NEXT:    fldl 12(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, 28(%eax)
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    fldl 28(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, 20(%eax)
-; X86-NEXT:    movl %ebx, 16(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    fldl 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 12(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -1299,46 +1150,46 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $12, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, %edi
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, %ebx
 ; X86-NOX87-NEXT:    movl %edx, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 28(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 24(%esi)
 ; X86-NOX87-NEXT:    movl %ebp, 20(%esi)
 ; X86-NOX87-NEXT:    movl %ebx, 16(%esi)
 ; X86-NOX87-NEXT:    movl %edi, 12(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    calll llrint
+; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $12, %esp
+; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -1413,86 +1264,58 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $120, %esp
-; X86-NEXT:    fldl 12(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 20(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 28(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 36(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 44(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 52(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NEXT:    fldl 60(%ebp)
-; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    fldl 68(%ebp)
 ; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, 60(%eax)
+; X86-NEXT:    movl %edx, 60(%eax)
 ; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    fldl 60(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, 52(%eax)
-; X86-NEXT:    movl %esi, 48(%eax)
-; X86-NEXT:    movl %edi, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 48(%eax)
+; X86-NEXT:    fldl 52(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 44(%eax)
 ; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 36(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 44(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 36(%eax)
 ; X86-NEXT:    movl %ecx, 32(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 36(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 28(%eax)
 ; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 28(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 20(%eax)
 ; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 20(%ebp)
+; X86-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    fldl 12(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -1502,63 +1325,71 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $44, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    subl $36, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %edx, %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, %ebx
+; X86-NOX87-NEXT:    movl %edx, %ebp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll llrint
-; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 60(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 56(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 52(%esi)
-; X86-NOX87-NEXT:    movl %edi, 48(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 44(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    movl %ebp, 52(%esi)
+; X86-NOX87-NEXT:    movl %ebx, 48(%esi)
+; X86-NOX87-NEXT:    movl %edi, 44(%esi)
+; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 40(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 36(%esi)
@@ -1576,12 +1407,12 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
 ; X86-NOX87-NEXT:    movl %eax, 12(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    calll llrint
+; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $44, %esp
+; X86-NOX87-NEXT:    addl $36, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -1700,15 +1531,21 @@ define <1 x i64> @llrint_v1i64_v1f128(<1 x fp128> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    leal -4(%ebp), %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
@@ -1716,15 +1553,21 @@ define <1 x i64> @llrint_v1i64_v1f128(<1 x fp128> %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -1758,35 +1601,36 @@ define <2 x i64> @llrint_v2i64_v2f128(<2 x fp128> %x) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    movl %edx, 12(%esi)
 ; X86-NEXT:    movl %eax, 8(%esi)
-; X86-NEXT:    movl %ebx, 4(%esi)
-; X86-NEXT:    movl %edi, (%esi)
+; X86-NEXT:    calll llrintl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %edx, 4(%esi)
+; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    leal -12(%ebp), %esp
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
@@ -1794,35 +1638,36 @@ define <2 x i64> @llrint_v2i64_v2f128(<2 x fp128> %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-NOX87-NEXT:    pushl 24(%ebp)
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    calll llrintl
-; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl 40(%ebp)
-; X86-NOX87-NEXT:    pushl 36(%ebp)
-; X86-NOX87-NEXT:    pushl 32(%ebp)
-; X86-NOX87-NEXT:    pushl 28(%ebp)
+; X86-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
-; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 24(%ebp), %esi
+; X86-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 16(%ebp), %esi
+; X86-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl 8(%ebp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl %edi, (%esi)
+; X86-NOX87-NEXT:    calll llrintl
+; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    leal -12(%ebp), %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
 ; X86-NOX87-NEXT:    popl %esi
-; X86-NOX87-NEXT:    popl %edi
-; X86-NOX87-NEXT:    popl %ebx
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl $4
 ;
@@ -1887,53 +1732,62 @@ define <4 x i64> @llrint_v4i64_v4f128(<4 x fp128> %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl 40(%ebp), %ebx
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 56(%ebp)
-; X86-NEXT:    pushl 52(%ebp)
-; X86-NEXT:    pushl 48(%ebp)
-; X86-NEXT:    pushl 44(%ebp)
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    pushl 72(%ebp)
-; X86-NEXT:    pushl 68(%ebp)
-; X86-NEXT:    pushl 64(%ebp)
-; X86-NEXT:    pushl 60(%ebp)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    movl 72(%ebp), %ecx
+; X86-NEXT:    movl 60(%ebp), %edx
+; X86-NEXT:    movl 64(%ebp), %edi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %edx, 28(%esi)
-; X86-NEXT:    movl %eax, 24(%esi)
-; X86-NEXT:    movl %ebx, 20(%esi)
-; X86-NEXT:    movl %edi, 16(%esi)
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    movl %edx, 28(%edi)
+; X86-NEXT:    movl %eax, 24(%edi)
+; X86-NEXT:    movl %esi, 20(%edi)
+; X86-NEXT:    movl %ebx, 16(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 12(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 8(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 4(%esi)
+; X86-NEXT:    movl %eax, 12(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, 8(%edi)
+; X86-NEXT:    calll llrintl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %edx, 4(%edi)
+; X86-NEXT:    movl %eax, (%edi)
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1949,53 +1803,62 @@ define <4 x i64> @llrint_v4i64_v4f128(<4 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
-; X86-NOX87-NEXT:    subl $32, %esp
-; X86-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-NOX87-NEXT:    movl 36(%ebp), %edi
-; X86-NOX87-NEXT:    movl 40(%ebp), %ebx
-; X86-NOX87-NEXT:    pushl 24(%ebp)
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
+; X86-NOX87-NEXT:    subl $16, %esp
+; X86-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl 32(%ebp)
-; X86-NOX87-NEXT:    pushl 28(%ebp)
+; X86-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-NOX87-NEXT:    movl 48(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 56(%ebp)
-; X86-NOX87-NEXT:    pushl 52(%ebp)
-; X86-NOX87-NEXT:    pushl 48(%ebp)
-; X86-NOX87-NEXT:    pushl 44(%ebp)
+; X86-NOX87-NEXT:    movl %eax, %ebx
+; X86-NOX87-NEXT:    movl %edx, %esi
+; X86-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-NOX87-NEXT:    movl 64(%ebp), %edi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
-; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl 72(%ebp)
-; X86-NOX87-NEXT:    pushl 68(%ebp)
-; X86-NOX87-NEXT:    pushl 64(%ebp)
-; X86-NOX87-NEXT:    pushl 60(%ebp)
+; X86-NOX87-NEXT:    movl 24(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl 8(%ebp), %edi
+; X86-NOX87-NEXT:    movl %edx, 28(%edi)
+; X86-NOX87-NEXT:    movl %eax, 24(%edi)
+; X86-NOX87-NEXT:    movl %esi, 20(%edi)
+; X86-NOX87-NEXT:    movl %ebx, 16(%edi)
+; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    movl %eax, 12(%edi)
+; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    movl %eax, 8(%edi)
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %edx, 28(%esi)
-; X86-NOX87-NEXT:    movl %eax, 24(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 20(%esi)
-; X86-NOX87-NEXT:    movl %edi, 16(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    movl %edx, 4(%edi)
+; X86-NOX87-NEXT:    movl %eax, (%edi)
+; X86-NOX87-NEXT:    movl %edi, %eax
 ; X86-NOX87-NEXT:    leal -12(%ebp), %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
@@ -2132,72 +1995,97 @@ define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl 8(%ebp), %esi
-; X86-NEXT:    movl 36(%ebp), %edi
-; X86-NEXT:    movl 40(%ebp), %ebx
-; X86-NEXT:    pushl 24(%ebp)
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl 12(%ebp)
-; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    pushl 28(%ebp)
+; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 56(%ebp)
-; X86-NEXT:    pushl 52(%ebp)
-; X86-NEXT:    pushl 48(%ebp)
-; X86-NEXT:    pushl 44(%ebp)
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 72(%ebp)
-; X86-NEXT:    pushl 68(%ebp)
-; X86-NEXT:    pushl 64(%ebp)
-; X86-NEXT:    pushl 60(%ebp)
+; X86-NEXT:    movl 68(%ebp), %eax
+; X86-NEXT:    movl 72(%ebp), %ecx
+; X86-NEXT:    movl 60(%ebp), %edx
+; X86-NEXT:    movl 64(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 88(%ebp)
-; X86-NEXT:    pushl 84(%ebp)
-; X86-NEXT:    pushl 80(%ebp)
-; X86-NEXT:    pushl 76(%ebp)
+; X86-NEXT:    movl 84(%ebp), %eax
+; X86-NEXT:    movl 88(%ebp), %ecx
+; X86-NEXT:    movl 76(%ebp), %edx
+; X86-NEXT:    movl 80(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 104(%ebp)
-; X86-NEXT:    pushl 100(%ebp)
-; X86-NEXT:    pushl 96(%ebp)
-; X86-NEXT:    pushl 92(%ebp)
+; X86-NEXT:    movl 100(%ebp), %eax
+; X86-NEXT:    movl 104(%ebp), %ecx
+; X86-NEXT:    movl 92(%ebp), %edx
+; X86-NEXT:    movl 96(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl 120(%ebp)
-; X86-NEXT:    pushl 116(%ebp)
-; X86-NEXT:    pushl 112(%ebp)
-; X86-NEXT:    pushl 108(%ebp)
+; X86-NEXT:    movl 116(%ebp), %eax
+; X86-NEXT:    movl 120(%ebp), %ecx
+; X86-NEXT:    movl 108(%ebp), %edx
+; X86-NEXT:    movl 112(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    pushl 136(%ebp)
-; X86-NEXT:    pushl 132(%ebp)
-; X86-NEXT:    pushl 128(%ebp)
-; X86-NEXT:    pushl 124(%ebp)
+; X86-NEXT:    movl 132(%ebp), %eax
+; X86-NEXT:    movl 136(%ebp), %ecx
+; X86-NEXT:    movl 124(%ebp), %edx
+; X86-NEXT:    movl 128(%ebp), %esi
+; X86-NEXT:    pushl %ecx
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edx
 ; X86-NEXT:    calll llrintl
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl %ecx, (%esp)
+; X86-NEXT:    movl 8(%ebp), %esi
 ; X86-NEXT:    movl %edx, 60(%esi)
 ; X86-NEXT:    movl %eax, 56(%esi)
 ; X86-NEXT:    movl %ebx, 52(%esi)
@@ -2222,9 +2110,9 @@ define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
 ; X86-NEXT:    movl %eax, 12(%esi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 8(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 4(%esi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    calll llrintl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %edx, 4(%esi)
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
@@ -2242,72 +2130,97 @@ define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
-; X86-NOX87-NEXT:    subl $64, %esp
-; X86-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-NOX87-NEXT:    movl 36(%ebp), %edi
-; X86-NOX87-NEXT:    movl 40(%ebp), %ebx
-; X86-NOX87-NEXT:    pushl 24(%ebp)
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    calll llrintl
-; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl 32(%ebp)
-; X86-NOX87-NEXT:    pushl 28(%ebp)
+; X86-NOX87-NEXT:    subl $48, %esp
+; X86-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 56(%ebp)
-; X86-NOX87-NEXT:    pushl 52(%ebp)
-; X86-NOX87-NEXT:    pushl 48(%ebp)
-; X86-NOX87-NEXT:    pushl 44(%ebp)
+; X86-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-NOX87-NEXT:    movl 48(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 72(%ebp)
-; X86-NOX87-NEXT:    pushl 68(%ebp)
-; X86-NOX87-NEXT:    pushl 64(%ebp)
-; X86-NOX87-NEXT:    pushl 60(%ebp)
+; X86-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-NOX87-NEXT:    movl 64(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 88(%ebp)
-; X86-NOX87-NEXT:    pushl 84(%ebp)
-; X86-NOX87-NEXT:    pushl 80(%ebp)
-; X86-NOX87-NEXT:    pushl 76(%ebp)
+; X86-NOX87-NEXT:    movl 84(%ebp), %eax
+; X86-NOX87-NEXT:    movl 88(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 76(%ebp), %edx
+; X86-NOX87-NEXT:    movl 80(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 104(%ebp)
-; X86-NOX87-NEXT:    pushl 100(%ebp)
-; X86-NOX87-NEXT:    pushl 96(%ebp)
-; X86-NOX87-NEXT:    pushl 92(%ebp)
+; X86-NOX87-NEXT:    movl 100(%ebp), %eax
+; X86-NOX87-NEXT:    movl 104(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 92(%ebp), %edx
+; X86-NOX87-NEXT:    movl 96(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 120(%ebp)
-; X86-NOX87-NEXT:    pushl 116(%ebp)
-; X86-NOX87-NEXT:    pushl 112(%ebp)
-; X86-NOX87-NEXT:    pushl 108(%ebp)
+; X86-NOX87-NEXT:    movl 116(%ebp), %eax
+; X86-NOX87-NEXT:    movl 120(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 108(%ebp), %edx
+; X86-NOX87-NEXT:    movl 112(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edi
 ; X86-NOX87-NEXT:    movl %edx, %ebx
-; X86-NOX87-NEXT:    pushl 136(%ebp)
-; X86-NOX87-NEXT:    pushl 132(%ebp)
-; X86-NOX87-NEXT:    pushl 128(%ebp)
-; X86-NOX87-NEXT:    pushl 124(%ebp)
+; X86-NOX87-NEXT:    movl 132(%ebp), %eax
+; X86-NOX87-NEXT:    movl 136(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 124(%ebp), %edx
+; X86-NOX87-NEXT:    movl 128(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll llrintl
-; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl 24(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl 8(%ebp), %esi
 ; X86-NOX87-NEXT:    movl %edx, 60(%esi)
 ; X86-NOX87-NEXT:    movl %eax, 56(%esi)
 ; X86-NOX87-NEXT:    movl %ebx, 52(%esi)
@@ -2332,9 +2245,9 @@ define <8 x i64> @llrint_v8i64_v8f128(<8 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    movl %eax, 12(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    calll llrintl
+; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
 ; X86-NOX87-NEXT:    leal -12(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index 86070fa541b84..4b0054cdc7a48 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -26,7 +26,8 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) nounwind {
 ;
 ; X86-NOX87-LABEL: lrint_v1f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    retl
@@ -47,7 +48,8 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) nounwind {
 ;
 ; X86-I64-NOX87-LABEL: lrint_v1f32:
 ; X86-I64-NOX87:       # %bb.0:
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrintf
 ; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    retl
@@ -81,8 +83,8 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) nounwind {
 ; X86-I32:       # %bb.0:
 ; X86-I32-NEXT:    subl $8, %esp
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl (%esp)
+; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    movl (%esp), %eax
 ; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -91,74 +93,62 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) nounwind {
 ;
 ; X86-NOX87-LABEL: lrint_v2f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, %esi
-; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrintf
 ; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edx
 ; X86-NOX87-NEXT:    movl %esi, %eax
 ; X86-NOX87-NEXT:    popl %esi
-; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    retl
 ;
 ; X86-I64-LABEL: lrint_v2f32:
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
 ; X86-I64-NEXT:    subl $16, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    flds 16(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %eax
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
+; X86-I64-NEXT:    movl %ecx, 8(%eax)
 ; X86-I64-NEXT:    flds 12(%ebp)
 ; X86-I64-NEXT:    fistpll (%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    movl (%esp), %ecx
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl %edi, 12(%eax)
-; X86-I64-NEXT:    movl %esi, 8(%eax)
 ; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -8(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
 ; X86-I64-NOX87-LABEL: lrint_v2f32:
 ; X86-I64-NOX87:       # %bb.0:
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, (%esi)
+; X86-I64-NOX87-NEXT:    calll lrintf
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
+; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
-; X86-I64-NOX87-NEXT:    popl %edi
-; X86-I64-NOX87-NEXT:    popl %ebx
-; X86-I64-NOX87-NEXT:    popl %ebp
 ; X86-I64-NOX87-NEXT:    retl $4
 ;
 ; X86-SSE2-LABEL: lrint_v2f32:
@@ -208,110 +198,91 @@ declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
 define <4 x iXLen> @lrint_v4f32(<4 x float> %x) nounwind {
 ; X86-I32-LABEL: lrint_v4f32:
 ; X86-I32:       # %bb.0:
-; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    subl $16, %esp
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 12(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 8(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 4(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl (%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    movl (%esp), %ecx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I32-NEXT:    movl %edi, 12(%eax)
-; X86-I32-NEXT:    movl %esi, 8(%eax)
-; X86-I32-NEXT:    movl %edx, 4(%eax)
 ; X86-I32-NEXT:    movl %ecx, (%eax)
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    popl %esi
-; X86-I32-NEXT:    popl %edi
 ; X86-I32-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: lrint_v4f32:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl %eax, %esi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl %ebp
+; X86-NOX87-NEXT:    movl %eax, %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 8(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 4(%ebx)
 ; X86-NOX87-NEXT:    calll lrintf
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
 ; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl %edi, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
-; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl $4
 ;
 ; X86-I64-LABEL: lrint_v4f32:
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
-; X86-I64-NEXT:    subl $56, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %eax
+; X86-I64-NEXT:    subl $32, %esp
 ; X86-I64-NEXT:    flds 24(%ebp)
-; X86-I64-NEXT:    flds 20(%ebp)
-; X86-I64-NEXT:    flds 16(%ebp)
-; X86-I64-NEXT:    flds 12(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NEXT:    movl %esi, 28(%eax)
+; X86-I64-NEXT:    movl %edx, 28(%eax)
 ; X86-I64-NEXT:    movl %ecx, 24(%eax)
+; X86-I64-NEXT:    flds 20(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-I64-NEXT:    movl %edx, 20(%eax)
-; X86-I64-NEXT:    movl %ebx, 16(%eax)
-; X86-I64-NEXT:    movl %edi, 12(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    movl %ecx, 16(%eax)
+; X86-I64-NEXT:    flds 16(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
 ; X86-I64-NEXT:    movl %ecx, 8(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 4(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 12(%ebp)
+; X86-I64-NEXT:    fistpll (%esp)
+; X86-I64-NEXT:    movl (%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -12(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
-; X86-I64-NEXT:    popl %ebx
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
@@ -321,41 +292,35 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %ebx
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    subl $12, %esp
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    subl $8, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %edi
-; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, %edi
-; X86-I64-NOX87-NEXT:    pushl %ebx
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, %ebx
 ; X86-I64-NOX87-NEXT:    movl %edx, %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 28(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 24(%esi)
 ; X86-I64-NOX87-NEXT:    movl %ebp, 20(%esi)
 ; X86-I64-NOX87-NEXT:    movl %ebx, 16(%esi)
 ; X86-I64-NOX87-NEXT:    movl %edi, 12(%esi)
-; X86-I64-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-I64-NOX87-NEXT:    calll lrintf
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
-; X86-I64-NOX87-NEXT:    addl $12, %esp
+; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %edi
 ; X86-I64-NOX87-NEXT:    popl %ebx
@@ -425,53 +390,41 @@ declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
 define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind {
 ; X86-I32-LABEL: lrint_v8f32:
 ; X86-I32:       # %bb.0:
-; X86-I32-NEXT:    pushl %ebp
-; X86-I32-NEXT:    pushl %ebx
-; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %esi
-; X86-I32-NEXT:    subl $40, %esp
-; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    subl $32, %esp
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 28(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 24(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 20(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 16(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 12(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 8(%eax)
 ; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I32-NEXT:    movl %edx, 28(%eax)
-; X86-I32-NEXT:    movl %ecx, 24(%eax)
-; X86-I32-NEXT:    movl %ebp, 20(%eax)
-; X86-I32-NEXT:    movl %ebx, 16(%eax)
-; X86-I32-NEXT:    movl %edi, 12(%eax)
-; X86-I32-NEXT:    movl %esi, 8(%eax)
-; X86-I32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-I32-NEXT:    movl %ecx, 4(%eax)
-; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I32-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl (%esp)
+; X86-I32-NEXT:    movl (%esp), %ecx
 ; X86-I32-NEXT:    movl %ecx, (%eax)
-; X86-I32-NEXT:    addl $40, %esp
-; X86-I32-NEXT:    popl %esi
-; X86-I32-NEXT:    popl %edi
-; X86-I32-NEXT:    popl %ebx
-; X86-I32-NEXT:    popl %ebp
+; X86-I32-NEXT:    addl $32, %esp
 ; X86-I32-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: lrint_v8f32:
@@ -481,54 +434,49 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
+; X86-NOX87-NEXT:    calll lrintf
+; X86-NOX87-NEXT:    movl %eax, %esi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-NOX87-NEXT:    calll lrintf
-; X86-NOX87-NEXT:    addl $4, %esp
-; X86-NOX87-NEXT:    movl %eax, 28(%esi)
-; X86-NOX87-NEXT:    movl %edi, 24(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 20(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 16(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOX87-NEXT:    movl %eax, 28(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 24(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 20(%ebx)
+; X86-NOX87-NEXT:    movl %ebp, 16(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, 8(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    movl %eax, 4(%ebx)
+; X86-NOX87-NEXT:    calll lrintf
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
@@ -540,86 +488,58 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind {
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
-; X86-I64-NEXT:    subl $120, %esp
-; X86-I64-NEXT:    flds 12(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 16(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 20(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 24(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 28(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 32(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    flds 36(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    subl $64, %esp
 ; X86-I64-NEXT:    flds 40(%ebp)
 ; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NEXT:    movl %ebx, 60(%eax)
+; X86-I64-NEXT:    movl %edx, 60(%eax)
 ; X86-I64-NEXT:    movl %ecx, 56(%eax)
+; X86-I64-NEXT:    flds 36(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-I64-NEXT:    movl %edx, 52(%eax)
-; X86-I64-NEXT:    movl %esi, 48(%eax)
-; X86-I64-NEXT:    movl %edi, 44(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    movl %ecx, 48(%eax)
+; X86-I64-NEXT:    flds 32(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 44(%eax)
 ; X86-I64-NEXT:    movl %ecx, 40(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 36(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 28(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 36(%eax)
 ; X86-I64-NEXT:    movl %ecx, 32(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 28(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 24(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 28(%eax)
 ; X86-I64-NEXT:    movl %ecx, 24(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 20(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 20(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 20(%eax)
 ; X86-I64-NEXT:    movl %ecx, 16(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 12(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 16(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
 ; X86-I64-NEXT:    movl %ecx, 8(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 4(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    flds 12(%ebp)
+; X86-I64-NEXT:    fistpll (%esp)
+; X86-I64-NEXT:    movl (%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -12(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
-; X86-I64-NEXT:    popl %ebx
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
@@ -629,78 +549,71 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %ebx
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    subl $44, %esp
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    subl $40, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebx
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %edi
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %edx, %edi
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %eax, %ebp
+; X86-I64-NOX87-NEXT:    movl %edx, %esi
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrintf
-; X86-I64-NOX87-NEXT:    addl $4, %esp
-; X86-I64-NOX87-NEXT:    movl %edx, 60(%esi)
-; X86-I64-NOX87-NEXT:    movl %eax, 56(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 52(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, 48(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebp, 44(%esi)
-; X86-I64-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 40(%esi)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-I64-NOX87-NEXT:    movl %edx, 60(%ebx)
+; X86-I64-NOX87-NEXT:    movl %eax, 56(%ebx)
+; X86-I64-NOX87-NEXT:    movl %esi, 52(%ebx)
+; X86-I64-NOX87-NEXT:    movl %ebp, 48(%ebx)
+; X86-I64-NOX87-NEXT:    movl %edi, 44(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 36(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 40(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 32(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 36(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 28(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 32(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 24(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 28(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 20(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 24(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 16(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 20(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 16(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 12(%ebx)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
-; X86-I64-NOX87-NEXT:    movl %esi, %eax
-; X86-I64-NOX87-NEXT:    addl $44, %esp
+; X86-I64-NOX87-NEXT:    movl %eax, 8(%ebx)
+; X86-I64-NOX87-NEXT:    calll lrintf
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%ebx)
+; X86-I64-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-I64-NOX87-NEXT:    movl %ebx, %eax
+; X86-I64-NOX87-NEXT:    addl $40, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %edi
 ; X86-I64-NOX87-NEXT:    popl %ebx
@@ -818,8 +731,10 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) nounwind {
 ;
 ; X86-NOX87-LABEL: lrint_v1f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    retl
@@ -840,8 +755,10 @@ define <1 x iXLen> @lrint_v1f64(<1 x double> %x) nounwind {
 ;
 ; X86-I64-NOX87-LABEL: lrint_v1f64:
 ; X86-I64-NOX87:       # %bb.0:
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    retl
@@ -875,8 +792,8 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) nounwind {
 ; X86-I32:       # %bb.0:
 ; X86-I32-NEXT:    subl $8, %esp
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl (%esp)
+; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    movl (%esp), %eax
 ; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -885,81 +802,70 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) nounwind {
 ;
 ; X86-NOX87-LABEL: lrint_v2f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, %esi
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edx
 ; X86-NOX87-NEXT:    movl %esi, %eax
 ; X86-NOX87-NEXT:    popl %esi
-; X86-NOX87-NEXT:    popl %edi
-; X86-NOX87-NEXT:    popl %ebx
 ; X86-NOX87-NEXT:    retl
 ;
 ; X86-I64-LABEL: lrint_v2f64:
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
 ; X86-I64-NEXT:    subl $16, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    fldl 20(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %eax
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
+; X86-I64-NEXT:    movl %ecx, 8(%eax)
 ; X86-I64-NEXT:    fldl 12(%ebp)
 ; X86-I64-NEXT:    fistpll (%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    movl (%esp), %ecx
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl %edi, 12(%eax)
-; X86-I64-NEXT:    movl %esi, 8(%eax)
 ; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -8(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
 ; X86-I64-NOX87-LABEL: lrint_v2f64:
 ; X86-I64-NOX87:       # %bb.0:
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    subl $8, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %eax, (%esp)
 ; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-I64-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, (%esi)
+; X86-I64-NOX87-NEXT:    calll lrint
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
+; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
-; X86-I64-NOX87-NEXT:    popl %edi
-; X86-I64-NOX87-NEXT:    popl %ebx
-; X86-I64-NOX87-NEXT:    popl %ebp
 ; X86-I64-NOX87-NEXT:    retl $4
 ;
 ; X86-SSE2-LABEL: lrint_v2f64:
@@ -1009,114 +915,100 @@ declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double>)
 define <4 x iXLen> @lrint_v4f64(<4 x double> %x) nounwind {
 ; X86-I32-LABEL: lrint_v4f64:
 ; X86-I32:       # %bb.0:
-; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    subl $16, %esp
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 12(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 8(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 4(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl (%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    movl (%esp), %ecx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I32-NEXT:    movl %edi, 12(%eax)
-; X86-I32-NEXT:    movl %esi, 8(%eax)
-; X86-I32-NEXT:    movl %edx, 4(%eax)
 ; X86-I32-NEXT:    movl %ecx, (%eax)
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    popl %esi
-; X86-I32-NEXT:    popl %edi
 ; X86-I32-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: lrint_v4f64:
 ; X86-NOX87:       # %bb.0:
-; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl %eax, %esi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, %edi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOX87-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 8(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 4(%ebx)
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl %edi, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
-; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl $4
 ;
 ; X86-I64-LABEL: lrint_v4f64:
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
-; X86-I64-NEXT:    subl $56, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %eax
+; X86-I64-NEXT:    subl $32, %esp
 ; X86-I64-NEXT:    fldl 36(%ebp)
-; X86-I64-NEXT:    fldl 28(%ebp)
-; X86-I64-NEXT:    fldl 20(%ebp)
-; X86-I64-NEXT:    fldl 12(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NEXT:    movl %esi, 28(%eax)
+; X86-I64-NEXT:    movl %edx, 28(%eax)
 ; X86-I64-NEXT:    movl %ecx, 24(%eax)
+; X86-I64-NEXT:    fldl 28(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-I64-NEXT:    movl %edx, 20(%eax)
-; X86-I64-NEXT:    movl %ebx, 16(%eax)
-; X86-I64-NEXT:    movl %edi, 12(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    movl %ecx, 16(%eax)
+; X86-I64-NEXT:    fldl 20(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
 ; X86-I64-NEXT:    movl %ecx, 8(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 4(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 12(%ebp)
+; X86-I64-NEXT:    fistpll (%esp)
+; X86-I64-NEXT:    movl (%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -12(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
-; X86-I64-NEXT:    popl %ebx
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
@@ -1126,46 +1018,46 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %ebx
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    subl $12, %esp
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, %edi
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, %ebx
 ; X86-I64-NOX87-NEXT:    movl %edx, %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 28(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 24(%esi)
 ; X86-I64-NOX87-NEXT:    movl %ebp, 20(%esi)
 ; X86-I64-NOX87-NEXT:    movl %ebx, 16(%esi)
 ; X86-I64-NOX87-NEXT:    movl %edi, 12(%esi)
-; X86-I64-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-I64-NOX87-NEXT:    calll lrint
+; X86-I64-NOX87-NEXT:    addl $8, %esp
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
-; X86-I64-NOX87-NEXT:    addl $12, %esp
+; X86-I64-NOX87-NEXT:    addl $4, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %edi
 ; X86-I64-NOX87-NEXT:    popl %ebx
@@ -1239,53 +1131,41 @@ declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double>)
 define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
 ; X86-I32-LABEL: lrint_v8f64:
 ; X86-I32:       # %bb.0:
-; X86-I32-NEXT:    pushl %ebp
-; X86-I32-NEXT:    pushl %ebx
-; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %esi
-; X86-I32-NEXT:    subl $40, %esp
-; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    subl $32, %esp
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 28(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 24(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 20(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 16(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 12(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I32-NEXT:    movl %ecx, 8(%eax)
 ; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-I32-NEXT:    fistpl {{[0-9]+}}(%esp)
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I32-NEXT:    movl %edx, 28(%eax)
-; X86-I32-NEXT:    movl %ecx, 24(%eax)
-; X86-I32-NEXT:    movl %ebp, 20(%eax)
-; X86-I32-NEXT:    movl %ebx, 16(%eax)
-; X86-I32-NEXT:    movl %edi, 12(%eax)
-; X86-I32-NEXT:    movl %esi, 8(%eax)
-; X86-I32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-I32-NEXT:    movl %ecx, 4(%eax)
-; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I32-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    fistpl (%esp)
+; X86-I32-NEXT:    movl (%esp), %ecx
 ; X86-I32-NEXT:    movl %ecx, (%eax)
-; X86-I32-NEXT:    addl $40, %esp
-; X86-I32-NEXT:    popl %esi
-; X86-I32-NEXT:    popl %edi
-; X86-I32-NEXT:    popl %ebx
-; X86-I32-NEXT:    popl %ebp
+; X86-I32-NEXT:    addl $32, %esp
 ; X86-I32-NEXT:    retl $4
 ;
 ; X86-NOX87-LABEL: lrint_v8f64:
@@ -1294,64 +1174,74 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
-; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    calll lrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %ebp
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    subl $12, %esp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, %ebp
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebp
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %eax, %esi
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
 ; X86-NOX87-NEXT:    addl $8, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    calll lrint
-; X86-NOX87-NEXT:    addl $8, %esp
-; X86-NOX87-NEXT:    movl %eax, 28(%esi)
-; X86-NOX87-NEXT:    movl %edi, 24(%esi)
-; X86-NOX87-NEXT:    movl %ebp, 20(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 16(%esi)
-; X86-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOX87-NEXT:    movl %eax, 28(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 24(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 20(%ebx)
+; X86-NOX87-NEXT:    movl %ebp, 16(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-NOX87-NEXT:    movl %eax, 8(%ebx)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl %eax, 4(%ebx)
+; X86-NOX87-NEXT:    calll lrint
+; X86-NOX87-NEXT:    addl $8, %esp
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
+; X86-NOX87-NEXT:    addl $12, %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
 ; X86-NOX87-NEXT:    popl %ebx
@@ -1362,86 +1252,58 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-8, %esp
-; X86-I64-NEXT:    subl $120, %esp
-; X86-I64-NEXT:    fldl 12(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 20(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 28(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 36(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 44(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 52(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-I64-NEXT:    fldl 60(%ebp)
-; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    subl $64, %esp
 ; X86-I64-NEXT:    fldl 68(%ebp)
 ; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
 ; X86-I64-NEXT:    movl 8(%ebp), %eax
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NEXT:    movl %ebx, 60(%eax)
+; X86-I64-NEXT:    movl %edx, 60(%eax)
 ; X86-I64-NEXT:    movl %ecx, 56(%eax)
+; X86-I64-NEXT:    fldl 60(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-I64-NEXT:    movl %edx, 52(%eax)
-; X86-I64-NEXT:    movl %esi, 48(%eax)
-; X86-I64-NEXT:    movl %edi, 44(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    movl %ecx, 48(%eax)
+; X86-I64-NEXT:    fldl 52(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 44(%eax)
 ; X86-I64-NEXT:    movl %ecx, 40(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 36(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 44(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 36(%eax)
 ; X86-I64-NEXT:    movl %ecx, 32(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 28(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 36(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 28(%eax)
 ; X86-I64-NEXT:    movl %ecx, 24(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 20(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 28(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 20(%eax)
 ; X86-I64-NEXT:    movl %ecx, 16(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 12(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 20(%ebp)
+; X86-I64-NEXT:    fistpll {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 12(%eax)
 ; X86-I64-NEXT:    movl %ecx, 8(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-I64-NEXT:    movl %ecx, 4(%eax)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-I64-NEXT:    fldl 12(%ebp)
+; X86-I64-NEXT:    fistpll (%esp)
+; X86-I64-NEXT:    movl (%esp), %ecx
+; X86-I64-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-I64-NEXT:    movl %edx, 4(%eax)
 ; X86-I64-NEXT:    movl %ecx, (%eax)
-; X86-I64-NEXT:    leal -12(%ebp), %esp
-; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
-; X86-I64-NEXT:    popl %ebx
+; X86-I64-NEXT:    movl %ebp, %esp
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
@@ -1451,63 +1313,71 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %ebx
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
-; X86-I64-NOX87-NEXT:    subl $44, %esp
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebp
-; X86-I64-NOX87-NEXT:    pushl %ebx
+; X86-I64-NOX87-NEXT:    subl $36, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %edi
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, %ebp
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %edx, %edi
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
 ; X86-I64-NOX87-NEXT:    addl $8, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-I64-NOX87-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %eax, %ebx
+; X86-I64-NOX87-NEXT:    movl %edx, %ebp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
 ; X86-I64-NOX87-NEXT:    calll lrint
-; X86-I64-NOX87-NEXT:    addl $8, %esp
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 60(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 56(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 52(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, 48(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebp, 44(%esi)
-; X86-I64-NOX87-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-I64-NOX87-NEXT:    movl %ebp, 52(%esi)
+; X86-I64-NOX87-NEXT:    movl %ebx, 48(%esi)
+; X86-I64-NOX87-NEXT:    movl %edi, 44(%esi)
+; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I64-NOX87-NEXT:    movl %eax, 40(%esi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I64-NOX87-NEXT:    movl %eax, 36(%esi)
@@ -1525,12 +1395,12 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    movl %eax, 12(%esi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NOX87-NEXT:    calll lrint
+; X86-I64-NOX87-NEXT:    addl $8, %esp
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
-; X86-I64-NOX87-NEXT:    addl $44, %esp
+; X86-I64-NOX87-NEXT:    addl $36, %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %edi
 ; X86-I64-NOX87-NEXT:    popl %ebx
@@ -1652,15 +1522,21 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
 ; X86-I32:       # %bb.0:
 ; X86-I32-NEXT:    pushl %ebp
 ; X86-I32-NEXT:    movl %esp, %ebp
+; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    andl $-16, %esp
 ; X86-I32-NEXT:    subl $16, %esp
-; X86-I32-NEXT:    pushl 20(%ebp)
-; X86-I32-NEXT:    pushl 16(%ebp)
-; X86-I32-NEXT:    pushl 12(%ebp)
-; X86-I32-NEXT:    pushl 8(%ebp)
+; X86-I32-NEXT:    movl 16(%ebp), %eax
+; X86-I32-NEXT:    movl 20(%ebp), %ecx
+; X86-I32-NEXT:    movl 8(%ebp), %edx
+; X86-I32-NEXT:    movl 12(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %ebp, %esp
+; X86-I32-NEXT:    leal -4(%ebp), %esp
+; X86-I32-NEXT:    popl %esi
 ; X86-I32-NEXT:    popl %ebp
 ; X86-I32-NEXT:    retl
 ;
@@ -1668,15 +1544,21 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
+; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %ebp, %esp
+; X86-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -1684,15 +1566,21 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
+; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-16, %esp
 ; X86-I64-NEXT:    subl $16, %esp
-; X86-I64-NEXT:    pushl 20(%ebp)
-; X86-I64-NEXT:    pushl 16(%ebp)
-; X86-I64-NEXT:    pushl 12(%ebp)
-; X86-I64-NEXT:    pushl 8(%ebp)
+; X86-I64-NEXT:    movl 16(%ebp), %eax
+; X86-I64-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NEXT:    movl 8(%ebp), %edx
+; X86-I64-NEXT:    movl 12(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %ebp, %esp
+; X86-I64-NEXT:    leal -4(%ebp), %esp
+; X86-I64-NEXT:    popl %esi
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl
 ;
@@ -1700,15 +1588,21 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
 ; X86-I64-NOX87:       # %bb.0:
 ; X86-I64-NOX87-NEXT:    pushl %ebp
 ; X86-I64-NOX87-NEXT:    movl %esp, %ebp
+; X86-I64-NOX87-NEXT:    pushl %esi
 ; X86-I64-NOX87-NEXT:    andl $-16, %esp
 ; X86-I64-NOX87-NEXT:    subl $16, %esp
-; X86-I64-NOX87-NEXT:    pushl 20(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 16(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 12(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 8(%ebp)
+; X86-I64-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %ebp, %esp
+; X86-I64-NOX87-NEXT:    leal -4(%ebp), %esp
+; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %ebp
 ; X86-I64-NOX87-NEXT:    retl
 ;
@@ -1716,15 +1610,21 @@ define <1 x iXLen> @lrint_v1fp128(<1 x fp128> %x) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
+; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1764,32 +1664,35 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
 ; X86-I32:       # %bb.0:
 ; X86-I32-NEXT:    pushl %ebp
 ; X86-I32-NEXT:    movl %esp, %ebp
-; X86-I32-NEXT:    pushl %ebx
 ; X86-I32-NEXT:    pushl %edi
 ; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    andl $-16, %esp
-; X86-I32-NEXT:    subl $16, %esp
-; X86-I32-NEXT:    movl 32(%ebp), %edi
-; X86-I32-NEXT:    movl 36(%ebp), %ebx
-; X86-I32-NEXT:    pushl 20(%ebp)
-; X86-I32-NEXT:    pushl 16(%ebp)
-; X86-I32-NEXT:    pushl 12(%ebp)
-; X86-I32-NEXT:    pushl 8(%ebp)
+; X86-I32-NEXT:    movl 16(%ebp), %eax
+; X86-I32-NEXT:    movl 20(%ebp), %ecx
+; X86-I32-NEXT:    movl 8(%ebp), %edx
+; X86-I32-NEXT:    movl 12(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, %esi
-; X86-I32-NEXT:    pushl %ebx
+; X86-I32-NEXT:    movl 32(%ebp), %eax
+; X86-I32-NEXT:    movl 36(%ebp), %ecx
+; X86-I32-NEXT:    movl 24(%ebp), %edx
+; X86-I32-NEXT:    movl 28(%ebp), %edi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
 ; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl 28(%ebp)
-; X86-I32-NEXT:    pushl 24(%ebp)
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, %edx
 ; X86-I32-NEXT:    movl %esi, %eax
-; X86-I32-NEXT:    leal -12(%ebp), %esp
+; X86-I32-NEXT:    leal -8(%ebp), %esp
 ; X86-I32-NEXT:    popl %esi
 ; X86-I32-NEXT:    popl %edi
-; X86-I32-NEXT:    popl %ebx
 ; X86-I32-NEXT:    popl %ebp
 ; X86-I32-NEXT:    retl
 ;
@@ -1797,32 +1700,35 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
 ; X86-NOX87:       # %bb.0:
 ; X86-NOX87-NEXT:    pushl %ebp
 ; X86-NOX87-NEXT:    movl %esp, %ebp
-; X86-NOX87-NEXT:    pushl %ebx
 ; X86-NOX87-NEXT:    pushl %edi
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
-; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    movl 32(%ebp), %edi
-; X86-NOX87-NEXT:    movl 36(%ebp), %ebx
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
-; X86-NOX87-NEXT:    pushl 8(%ebp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %eax
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 8(%ebp), %edx
+; X86-NOX87-NEXT:    movl 12(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, %esi
-; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    movl 32(%ebp), %eax
+; X86-NOX87-NEXT:    movl 36(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 24(%ebp), %edx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl 28(%ebp)
-; X86-NOX87-NEXT:    pushl 24(%ebp)
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edx
 ; X86-NOX87-NEXT:    movl %esi, %eax
-; X86-NOX87-NEXT:    leal -12(%ebp), %esp
+; X86-NOX87-NEXT:    leal -8(%ebp), %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
-; X86-NOX87-NEXT:    popl %ebx
 ; X86-NOX87-NEXT:    popl %ebp
 ; X86-NOX87-NEXT:    retl
 ;
@@ -1830,35 +1736,36 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
 ; X86-I64:       # %bb.0:
 ; X86-I64-NEXT:    pushl %ebp
 ; X86-I64-NEXT:    movl %esp, %ebp
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
 ; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-16, %esp
 ; X86-I64-NEXT:    subl $16, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NEXT:    pushl 24(%ebp)
-; X86-I64-NEXT:    pushl 20(%ebp)
-; X86-I64-NEXT:    pushl 16(%ebp)
-; X86-I64-NEXT:    pushl 12(%ebp)
-; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %eax, %edi
-; X86-I64-NEXT:    movl %edx, %ebx
-; X86-I64-NEXT:    pushl 40(%ebp)
-; X86-I64-NEXT:    pushl 36(%ebp)
-; X86-I64-NEXT:    pushl 32(%ebp)
-; X86-I64-NEXT:    pushl 28(%ebp)
+; X86-I64-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
+; X86-I64-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NEXT:    movl 24(%ebp), %esi
+; X86-I64-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NEXT:    movl 16(%ebp), %esi
+; X86-I64-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl %ecx, (%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %esi
 ; X86-I64-NEXT:    movl %edx, 12(%esi)
 ; X86-I64-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NEXT:    movl %ebx, 4(%esi)
-; X86-I64-NEXT:    movl %edi, (%esi)
+; X86-I64-NEXT:    calll lrintl
+; X86-I64-NEXT:    addl $16, %esp
+; X86-I64-NEXT:    movl %edx, 4(%esi)
+; X86-I64-NEXT:    movl %eax, (%esi)
 ; X86-I64-NEXT:    movl %esi, %eax
-; X86-I64-NEXT:    leal -12(%ebp), %esp
+; X86-I64-NEXT:    leal -4(%ebp), %esp
 ; X86-I64-NEXT:    popl %esi
-; X86-I64-NEXT:    popl %edi
-; X86-I64-NEXT:    popl %ebx
 ; X86-I64-NEXT:    popl %ebp
 ; X86-I64-NEXT:    retl $4
 ;
@@ -1866,35 +1773,36 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
 ; X86-I64-NOX87:       # %bb.0:
 ; X86-I64-NOX87-NEXT:    pushl %ebp
 ; X86-I64-NOX87-NEXT:    movl %esp, %ebp
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
 ; X86-I64-NOX87-NEXT:    andl $-16, %esp
 ; X86-I64-NOX87-NEXT:    subl $16, %esp
-; X86-I64-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NOX87-NEXT:    pushl 24(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 20(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 16(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 12(%ebp)
-; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl 40(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 36(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 32(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 28(%ebp)
+; X86-I64-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
+; X86-I64-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 24(%ebp), %esi
+; X86-I64-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 16(%ebp), %esi
+; X86-I64-NOX87-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl 8(%ebp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 12(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, (%esi)
+; X86-I64-NOX87-NEXT:    calll lrintl
+; X86-I64-NOX87-NEXT:    addl $16, %esp
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
-; X86-I64-NOX87-NEXT:    leal -12(%ebp), %esp
+; X86-I64-NOX87-NEXT:    leal -4(%ebp), %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
-; X86-I64-NOX87-NEXT:    popl %edi
-; X86-I64-NOX87-NEXT:    popl %ebx
 ; X86-I64-NOX87-NEXT:    popl %ebp
 ; X86-I64-NOX87-NEXT:    retl $4
 ;
@@ -1902,35 +1810,38 @@ define <2 x iXLen> @lrint_v2fp128(<2 x fp128> %x) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
-; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl 12(%ebp), %edi
-; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
-; X86-SSE2-NEXT:    movl 20(%ebp), %esi
-; X86-SSE2-NEXT:    pushl 36(%ebp)
-; X86-SSE2-NEXT:    pushl 32(%ebp)
-; X86-SSE2-NEXT:    pushl 28(%ebp)
-; X86-SSE2-NEXT:    pushl 24(%ebp)
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 32(%ebp), %eax
+; X86-SSE2-NEXT:    movl 36(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 24(%ebp), %edx
+; X86-SSE2-NEXT:    movl 28(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    movl 16(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 20(%ebp), %edx
+; X86-SSE2-NEXT:    movl 8(%ebp), %esi
+; X86-SSE2-NEXT:    movl 12(%ebp), %edi
 ; X86-SSE2-NEXT:    movd %eax, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, (%esp) # 16-byte Spill
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edx
+; X86-SSE2-NEXT:    pushl %ecx
 ; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movd %eax, %xmm0
 ; X86-SSE2-NEXT:    punpckldq (%esp), %xmm0 # 16-byte Folded Reload
 ; X86-SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; X86-SSE2-NEXT:    leal -12(%ebp), %esp
+; X86-SSE2-NEXT:    leal -8(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
@@ -2000,42 +1911,53 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
 ; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    andl $-16, %esp
 ; X86-I32-NEXT:    subl $16, %esp
-; X86-I32-NEXT:    movl 8(%ebp), %esi
-; X86-I32-NEXT:    movl 36(%ebp), %ebx
-; X86-I32-NEXT:    movl 40(%ebp), %edi
-; X86-I32-NEXT:    pushl 24(%ebp)
-; X86-I32-NEXT:    pushl 20(%ebp)
-; X86-I32-NEXT:    pushl 16(%ebp)
-; X86-I32-NEXT:    pushl 12(%ebp)
+; X86-I32-NEXT:    movl 36(%ebp), %eax
+; X86-I32-NEXT:    movl 40(%ebp), %ecx
+; X86-I32-NEXT:    movl 28(%ebp), %edx
+; X86-I32-NEXT:    movl 32(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-I32-NEXT:    movl %eax, %esi
+; X86-I32-NEXT:    movl 52(%ebp), %eax
+; X86-I32-NEXT:    movl 56(%ebp), %ecx
+; X86-I32-NEXT:    movl 44(%ebp), %edx
+; X86-I32-NEXT:    movl 48(%ebp), %edi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
 ; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %ebx
-; X86-I32-NEXT:    pushl 32(%ebp)
-; X86-I32-NEXT:    pushl 28(%ebp)
-; X86-I32-NEXT:    calll lrintl
-; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %eax, %ebx
-; X86-I32-NEXT:    pushl 56(%ebp)
-; X86-I32-NEXT:    pushl 52(%ebp)
-; X86-I32-NEXT:    pushl 48(%ebp)
-; X86-I32-NEXT:    pushl 44(%ebp)
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, %edi
-; X86-I32-NEXT:    pushl 72(%ebp)
-; X86-I32-NEXT:    pushl 68(%ebp)
-; X86-I32-NEXT:    pushl 64(%ebp)
-; X86-I32-NEXT:    pushl 60(%ebp)
+; X86-I32-NEXT:    movl 68(%ebp), %eax
+; X86-I32-NEXT:    movl 72(%ebp), %ecx
+; X86-I32-NEXT:    movl 60(%ebp), %edx
+; X86-I32-NEXT:    movl 64(%ebp), %ebx
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %ebx
+; X86-I32-NEXT:    pushl %edx
+; X86-I32-NEXT:    calll lrintl
+; X86-I32-NEXT:    movl 20(%ebp), %ecx
+; X86-I32-NEXT:    movl 24(%ebp), %edx
+; X86-I32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl 12(%ebp), %ecx
+; X86-I32-NEXT:    movl 16(%ebp), %edx
+; X86-I32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl %ecx, (%esp)
+; X86-I32-NEXT:    movl 8(%ebp), %ebx
+; X86-I32-NEXT:    movl %eax, 12(%ebx)
+; X86-I32-NEXT:    movl %edi, 8(%ebx)
+; X86-I32-NEXT:    movl %esi, 4(%ebx)
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %eax, 12(%esi)
-; X86-I32-NEXT:    movl %edi, 8(%esi)
-; X86-I32-NEXT:    movl %ebx, 4(%esi)
-; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I32-NEXT:    movl %eax, (%esi)
-; X86-I32-NEXT:    movl %esi, %eax
+; X86-I32-NEXT:    movl %eax, (%ebx)
+; X86-I32-NEXT:    movl %ebx, %eax
 ; X86-I32-NEXT:    leal -12(%ebp), %esp
 ; X86-I32-NEXT:    popl %esi
 ; X86-I32-NEXT:    popl %edi
@@ -2052,42 +1974,53 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $16, %esp
-; X86-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-NOX87-NEXT:    movl 36(%ebp), %ebx
-; X86-NOX87-NEXT:    movl 40(%ebp), %edi
-; X86-NOX87-NEXT:    pushl 24(%ebp)
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
+; X86-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOX87-NEXT:    movl %eax, %esi
+; X86-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-NOX87-NEXT:    movl 48(%ebp), %edi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
 ; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl 32(%ebp)
-; X86-NOX87-NEXT:    pushl 28(%ebp)
-; X86-NOX87-NEXT:    calll lrintl
-; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl 56(%ebp)
-; X86-NOX87-NEXT:    pushl 52(%ebp)
-; X86-NOX87-NEXT:    pushl 48(%ebp)
-; X86-NOX87-NEXT:    pushl 44(%ebp)
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl 72(%ebp)
-; X86-NOX87-NEXT:    pushl 68(%ebp)
-; X86-NOX87-NEXT:    pushl 64(%ebp)
-; X86-NOX87-NEXT:    pushl 60(%ebp)
+; X86-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-NOX87-NEXT:    movl 64(%ebp), %ebx
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %ebx
+; X86-NOX87-NEXT:    pushl %edx
+; X86-NOX87-NEXT:    calll lrintl
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 24(%ebp), %edx
+; X86-NOX87-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 16(%ebp), %edx
+; X86-NOX87-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl 8(%ebp), %ebx
+; X86-NOX87-NEXT:    movl %eax, 12(%ebx)
+; X86-NOX87-NEXT:    movl %edi, 8(%ebx)
+; X86-NOX87-NEXT:    movl %esi, 4(%ebx)
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-NOX87-NEXT:    movl %edi, 8(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 4(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NOX87-NEXT:    movl %eax, (%esi)
-; X86-NOX87-NEXT:    movl %esi, %eax
+; X86-NOX87-NEXT:    movl %eax, (%ebx)
+; X86-NOX87-NEXT:    movl %ebx, %eax
 ; X86-NOX87-NEXT:    leal -12(%ebp), %esp
 ; X86-NOX87-NEXT:    popl %esi
 ; X86-NOX87-NEXT:    popl %edi
@@ -2103,53 +2036,62 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
 ; X86-I64-NEXT:    pushl %edi
 ; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-16, %esp
-; X86-I64-NEXT:    subl $32, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NEXT:    movl 36(%ebp), %edi
-; X86-I64-NEXT:    movl 40(%ebp), %ebx
-; X86-I64-NEXT:    pushl 24(%ebp)
-; X86-I64-NEXT:    pushl 20(%ebp)
-; X86-I64-NEXT:    pushl 16(%ebp)
-; X86-I64-NEXT:    pushl 12(%ebp)
-; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl 32(%ebp)
-; X86-I64-NEXT:    pushl 28(%ebp)
+; X86-I64-NEXT:    subl $16, %esp
+; X86-I64-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 56(%ebp)
-; X86-I64-NEXT:    pushl 52(%ebp)
-; X86-I64-NEXT:    pushl 48(%ebp)
-; X86-I64-NEXT:    pushl 44(%ebp)
+; X86-I64-NEXT:    movl 52(%ebp), %eax
+; X86-I64-NEXT:    movl 56(%ebp), %ecx
+; X86-I64-NEXT:    movl 44(%ebp), %edx
+; X86-I64-NEXT:    movl 48(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %eax, %edi
-; X86-I64-NEXT:    movl %edx, %ebx
-; X86-I64-NEXT:    pushl 72(%ebp)
-; X86-I64-NEXT:    pushl 68(%ebp)
-; X86-I64-NEXT:    pushl 64(%ebp)
-; X86-I64-NEXT:    pushl 60(%ebp)
+; X86-I64-NEXT:    movl %eax, %ebx
+; X86-I64-NEXT:    movl %edx, %esi
+; X86-I64-NEXT:    movl 68(%ebp), %eax
+; X86-I64-NEXT:    movl 72(%ebp), %ecx
+; X86-I64-NEXT:    movl 60(%ebp), %edx
+; X86-I64-NEXT:    movl 64(%ebp), %edi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %edi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %edx, 28(%esi)
-; X86-I64-NEXT:    movl %eax, 24(%esi)
-; X86-I64-NEXT:    movl %ebx, 20(%esi)
-; X86-I64-NEXT:    movl %edi, 16(%esi)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NEXT:    movl %eax, 12(%esi)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NEXT:    movl %eax, 8(%esi)
+; X86-I64-NEXT:    movl 24(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 16(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, (%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %edi
+; X86-I64-NEXT:    movl %edx, 28(%edi)
+; X86-I64-NEXT:    movl %eax, 24(%edi)
+; X86-I64-NEXT:    movl %esi, 20(%edi)
+; X86-I64-NEXT:    movl %ebx, 16(%edi)
 ; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NEXT:    movl %eax, 4(%esi)
+; X86-I64-NEXT:    movl %eax, 12(%edi)
 ; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NEXT:    movl %eax, (%esi)
-; X86-I64-NEXT:    movl %esi, %eax
+; X86-I64-NEXT:    movl %eax, 8(%edi)
+; X86-I64-NEXT:    calll lrintl
+; X86-I64-NEXT:    addl $16, %esp
+; X86-I64-NEXT:    movl %edx, 4(%edi)
+; X86-I64-NEXT:    movl %eax, (%edi)
+; X86-I64-NEXT:    movl %edi, %eax
 ; X86-I64-NEXT:    leal -12(%ebp), %esp
 ; X86-I64-NEXT:    popl %esi
 ; X86-I64-NEXT:    popl %edi
@@ -2165,53 +2107,62 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
 ; X86-I64-NOX87-NEXT:    andl $-16, %esp
-; X86-I64-NOX87-NEXT:    subl $32, %esp
-; X86-I64-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NOX87-NEXT:    movl 36(%ebp), %edi
-; X86-I64-NOX87-NEXT:    movl 40(%ebp), %ebx
-; X86-I64-NOX87-NEXT:    pushl 24(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 20(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 16(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 12(%ebp)
-; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
-; X86-I64-NOX87-NEXT:    pushl 32(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 28(%ebp)
+; X86-I64-NOX87-NEXT:    subl $16, %esp
+; X86-I64-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 56(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 52(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 48(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 44(%ebp)
+; X86-I64-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 48(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, %edi
-; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl 72(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 68(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 64(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 60(%ebp)
+; X86-I64-NOX87-NEXT:    movl %eax, %ebx
+; X86-I64-NOX87-NEXT:    movl %edx, %esi
+; X86-I64-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 64(%ebp), %edi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %edi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %edx, 28(%esi)
-; X86-I64-NOX87-NEXT:    movl %eax, 24(%esi)
-; X86-I64-NOX87-NEXT:    movl %ebx, 20(%esi)
-; X86-I64-NOX87-NEXT:    movl %edi, 16(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 12(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
+; X86-I64-NOX87-NEXT:    movl 24(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 16(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl 8(%ebp), %edi
+; X86-I64-NOX87-NEXT:    movl %edx, 28(%edi)
+; X86-I64-NOX87-NEXT:    movl %eax, 24(%edi)
+; X86-I64-NOX87-NEXT:    movl %esi, 20(%edi)
+; X86-I64-NOX87-NEXT:    movl %ebx, 16(%edi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
+; X86-I64-NOX87-NEXT:    movl %eax, 12(%edi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
-; X86-I64-NOX87-NEXT:    movl %esi, %eax
+; X86-I64-NOX87-NEXT:    movl %eax, 8(%edi)
+; X86-I64-NOX87-NEXT:    calll lrintl
+; X86-I64-NOX87-NEXT:    addl $16, %esp
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%edi)
+; X86-I64-NOX87-NEXT:    movl %eax, (%edi)
+; X86-I64-NOX87-NEXT:    movl %edi, %eax
 ; X86-I64-NOX87-NEXT:    leal -12(%ebp), %esp
 ; X86-I64-NOX87-NEXT:    popl %esi
 ; X86-I64-NOX87-NEXT:    popl %edi
@@ -2228,38 +2179,52 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $48, %esp
-; X86-SSE2-NEXT:    movl 48(%ebp), %edi
-; X86-SSE2-NEXT:    movl 52(%ebp), %ebx
-; X86-SSE2-NEXT:    pushl 36(%ebp)
-; X86-SSE2-NEXT:    pushl 32(%ebp)
-; X86-SSE2-NEXT:    pushl 28(%ebp)
-; X86-SSE2-NEXT:    pushl 24(%ebp)
+; X86-SSE2-NEXT:    movl 32(%ebp), %eax
+; X86-SSE2-NEXT:    movl 36(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 24(%ebp), %edx
+; X86-SSE2-NEXT:    movl 28(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %esi
-; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    movl 48(%ebp), %eax
+; X86-SSE2-NEXT:    movl 52(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 40(%ebp), %edx
+; X86-SSE2-NEXT:    movl 44(%ebp), %edi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl 44(%ebp)
-; X86-SSE2-NEXT:    pushl 40(%ebp)
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %edi
-; X86-SSE2-NEXT:    pushl 68(%ebp)
-; X86-SSE2-NEXT:    pushl 64(%ebp)
-; X86-SSE2-NEXT:    pushl 60(%ebp)
-; X86-SSE2-NEXT:    pushl 56(%ebp)
+; X86-SSE2-NEXT:    movl 64(%ebp), %eax
+; X86-SSE2-NEXT:    movl 68(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 56(%ebp), %edx
+; X86-SSE2-NEXT:    movl 60(%ebp), %ebx
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    movl 16(%ebp), %ecx
 ; X86-SSE2-NEXT:    movd %eax, %xmm0
+; X86-SSE2-NEXT:    movl 20(%ebp), %eax
 ; X86-SSE2-NEXT:    movd %edi, %xmm1
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-SSE2-NEXT:    movd %esi, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, (%esp) # 16-byte Spill
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movd %eax, %xmm0
@@ -2437,67 +2402,93 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I32-NEXT:    pushl %esi
 ; X86-I32-NEXT:    andl $-16, %esp
 ; X86-I32-NEXT:    subl $32, %esp
-; X86-I32-NEXT:    movl 8(%ebp), %esi
-; X86-I32-NEXT:    movl 36(%ebp), %ebx
-; X86-I32-NEXT:    movl 40(%ebp), %edi
-; X86-I32-NEXT:    pushl 24(%ebp)
-; X86-I32-NEXT:    pushl 20(%ebp)
-; X86-I32-NEXT:    pushl 16(%ebp)
-; X86-I32-NEXT:    pushl 12(%ebp)
+; X86-I32-NEXT:    movl 36(%ebp), %eax
+; X86-I32-NEXT:    movl 40(%ebp), %ecx
+; X86-I32-NEXT:    movl 28(%ebp), %edx
+; X86-I32-NEXT:    movl 32(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    pushl %edi
-; X86-I32-NEXT:    pushl %ebx
-; X86-I32-NEXT:    pushl 32(%ebp)
-; X86-I32-NEXT:    pushl 28(%ebp)
+; X86-I32-NEXT:    movl 52(%ebp), %eax
+; X86-I32-NEXT:    movl 56(%ebp), %ecx
+; X86-I32-NEXT:    movl 44(%ebp), %edx
+; X86-I32-NEXT:    movl 48(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    pushl 56(%ebp)
-; X86-I32-NEXT:    pushl 52(%ebp)
-; X86-I32-NEXT:    pushl 48(%ebp)
-; X86-I32-NEXT:    pushl 44(%ebp)
+; X86-I32-NEXT:    movl 68(%ebp), %eax
+; X86-I32-NEXT:    movl 72(%ebp), %ecx
+; X86-I32-NEXT:    movl 60(%ebp), %edx
+; X86-I32-NEXT:    movl 64(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    pushl 72(%ebp)
-; X86-I32-NEXT:    pushl 68(%ebp)
-; X86-I32-NEXT:    pushl 64(%ebp)
-; X86-I32-NEXT:    pushl 60(%ebp)
+; X86-I32-NEXT:    movl 84(%ebp), %eax
+; X86-I32-NEXT:    movl 88(%ebp), %ecx
+; X86-I32-NEXT:    movl 76(%ebp), %edx
+; X86-I32-NEXT:    movl 80(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    pushl 88(%ebp)
-; X86-I32-NEXT:    pushl 84(%ebp)
-; X86-I32-NEXT:    pushl 80(%ebp)
-; X86-I32-NEXT:    pushl 76(%ebp)
+; X86-I32-NEXT:    movl 100(%ebp), %eax
+; X86-I32-NEXT:    movl 104(%ebp), %ecx
+; X86-I32-NEXT:    movl 92(%ebp), %edx
+; X86-I32-NEXT:    movl 96(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I32-NEXT:    pushl 104(%ebp)
-; X86-I32-NEXT:    pushl 100(%ebp)
-; X86-I32-NEXT:    pushl 96(%ebp)
-; X86-I32-NEXT:    pushl 92(%ebp)
+; X86-I32-NEXT:    movl %eax, %edi
+; X86-I32-NEXT:    movl 116(%ebp), %eax
+; X86-I32-NEXT:    movl 120(%ebp), %ecx
+; X86-I32-NEXT:    movl 108(%ebp), %edx
+; X86-I32-NEXT:    movl 112(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
 ; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, %ebx
-; X86-I32-NEXT:    pushl 120(%ebp)
-; X86-I32-NEXT:    pushl 116(%ebp)
-; X86-I32-NEXT:    pushl 112(%ebp)
-; X86-I32-NEXT:    pushl 108(%ebp)
-; X86-I32-NEXT:    calll lrintl
-; X86-I32-NEXT:    addl $16, %esp
-; X86-I32-NEXT:    movl %eax, %edi
-; X86-I32-NEXT:    pushl 136(%ebp)
-; X86-I32-NEXT:    pushl 132(%ebp)
-; X86-I32-NEXT:    pushl 128(%ebp)
-; X86-I32-NEXT:    pushl 124(%ebp)
+; X86-I32-NEXT:    movl 132(%ebp), %eax
+; X86-I32-NEXT:    movl 136(%ebp), %ecx
+; X86-I32-NEXT:    movl 124(%ebp), %edx
+; X86-I32-NEXT:    movl 128(%ebp), %esi
+; X86-I32-NEXT:    pushl %ecx
+; X86-I32-NEXT:    pushl %eax
+; X86-I32-NEXT:    pushl %esi
+; X86-I32-NEXT:    pushl %edx
 ; X86-I32-NEXT:    calll lrintl
-; X86-I32-NEXT:    addl $16, %esp
+; X86-I32-NEXT:    movl 24(%ebp), %ecx
+; X86-I32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl 20(%ebp), %ecx
+; X86-I32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl 16(%ebp), %ecx
+; X86-I32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I32-NEXT:    movl 12(%ebp), %ecx
+; X86-I32-NEXT:    movl %ecx, (%esp)
+; X86-I32-NEXT:    movl 8(%ebp), %esi
 ; X86-I32-NEXT:    movl %eax, 28(%esi)
-; X86-I32-NEXT:    movl %edi, 24(%esi)
-; X86-I32-NEXT:    movl %ebx, 20(%esi)
+; X86-I32-NEXT:    movl %ebx, 24(%esi)
+; X86-I32-NEXT:    movl %edi, 20(%esi)
 ; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I32-NEXT:    movl %eax, 16(%esi)
 ; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2506,7 +2497,8 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I32-NEXT:    movl %eax, 8(%esi)
 ; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I32-NEXT:    movl %eax, 4(%esi)
-; X86-I32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I32-NEXT:    calll lrintl
+; X86-I32-NEXT:    addl $16, %esp
 ; X86-I32-NEXT:    movl %eax, (%esi)
 ; X86-I32-NEXT:    movl %esi, %eax
 ; X86-I32-NEXT:    leal -12(%ebp), %esp
@@ -2525,67 +2517,93 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    pushl %esi
 ; X86-NOX87-NEXT:    andl $-16, %esp
 ; X86-NOX87-NEXT:    subl $32, %esp
-; X86-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-NOX87-NEXT:    movl 36(%ebp), %ebx
-; X86-NOX87-NEXT:    movl 40(%ebp), %edi
-; X86-NOX87-NEXT:    pushl 24(%ebp)
-; X86-NOX87-NEXT:    pushl 20(%ebp)
-; X86-NOX87-NEXT:    pushl 16(%ebp)
-; X86-NOX87-NEXT:    pushl 12(%ebp)
+; X86-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl %edi
-; X86-NOX87-NEXT:    pushl %ebx
-; X86-NOX87-NEXT:    pushl 32(%ebp)
-; X86-NOX87-NEXT:    pushl 28(%ebp)
+; X86-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-NOX87-NEXT:    movl 48(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 56(%ebp)
-; X86-NOX87-NEXT:    pushl 52(%ebp)
-; X86-NOX87-NEXT:    pushl 48(%ebp)
-; X86-NOX87-NEXT:    pushl 44(%ebp)
+; X86-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-NOX87-NEXT:    movl 64(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 72(%ebp)
-; X86-NOX87-NEXT:    pushl 68(%ebp)
-; X86-NOX87-NEXT:    pushl 64(%ebp)
-; X86-NOX87-NEXT:    pushl 60(%ebp)
+; X86-NOX87-NEXT:    movl 84(%ebp), %eax
+; X86-NOX87-NEXT:    movl 88(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 76(%ebp), %edx
+; X86-NOX87-NEXT:    movl 80(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 88(%ebp)
-; X86-NOX87-NEXT:    pushl 84(%ebp)
-; X86-NOX87-NEXT:    pushl 80(%ebp)
-; X86-NOX87-NEXT:    pushl 76(%ebp)
+; X86-NOX87-NEXT:    movl 100(%ebp), %eax
+; X86-NOX87-NEXT:    movl 104(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 92(%ebp), %edx
+; X86-NOX87-NEXT:    movl 96(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NOX87-NEXT:    pushl 104(%ebp)
-; X86-NOX87-NEXT:    pushl 100(%ebp)
-; X86-NOX87-NEXT:    pushl 96(%ebp)
-; X86-NOX87-NEXT:    pushl 92(%ebp)
+; X86-NOX87-NEXT:    movl %eax, %edi
+; X86-NOX87-NEXT:    movl 116(%ebp), %eax
+; X86-NOX87-NEXT:    movl 120(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 108(%ebp), %edx
+; X86-NOX87-NEXT:    movl 112(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
 ; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, %ebx
-; X86-NOX87-NEXT:    pushl 120(%ebp)
-; X86-NOX87-NEXT:    pushl 116(%ebp)
-; X86-NOX87-NEXT:    pushl 112(%ebp)
-; X86-NOX87-NEXT:    pushl 108(%ebp)
-; X86-NOX87-NEXT:    calll lrintl
-; X86-NOX87-NEXT:    addl $16, %esp
-; X86-NOX87-NEXT:    movl %eax, %edi
-; X86-NOX87-NEXT:    pushl 136(%ebp)
-; X86-NOX87-NEXT:    pushl 132(%ebp)
-; X86-NOX87-NEXT:    pushl 128(%ebp)
-; X86-NOX87-NEXT:    pushl 124(%ebp)
+; X86-NOX87-NEXT:    movl 132(%ebp), %eax
+; X86-NOX87-NEXT:    movl 136(%ebp), %ecx
+; X86-NOX87-NEXT:    movl 124(%ebp), %edx
+; X86-NOX87-NEXT:    movl 128(%ebp), %esi
+; X86-NOX87-NEXT:    pushl %ecx
+; X86-NOX87-NEXT:    pushl %eax
+; X86-NOX87-NEXT:    pushl %esi
+; X86-NOX87-NEXT:    pushl %edx
 ; X86-NOX87-NEXT:    calll lrintl
-; X86-NOX87-NEXT:    addl $16, %esp
+; X86-NOX87-NEXT:    movl 24(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 16(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-NOX87-NEXT:    movl 8(%ebp), %esi
 ; X86-NOX87-NEXT:    movl %eax, 28(%esi)
-; X86-NOX87-NEXT:    movl %edi, 24(%esi)
-; X86-NOX87-NEXT:    movl %ebx, 20(%esi)
+; X86-NOX87-NEXT:    movl %ebx, 24(%esi)
+; X86-NOX87-NEXT:    movl %edi, 20(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 16(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2594,7 +2612,8 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-NOX87-NEXT:    movl %eax, 8(%esi)
 ; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOX87-NEXT:    calll lrintl
+; X86-NOX87-NEXT:    addl $16, %esp
 ; X86-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-NOX87-NEXT:    movl %esi, %eax
 ; X86-NOX87-NEXT:    leal -12(%ebp), %esp
@@ -2612,72 +2631,97 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I64-NEXT:    pushl %edi
 ; X86-I64-NEXT:    pushl %esi
 ; X86-I64-NEXT:    andl $-16, %esp
-; X86-I64-NEXT:    subl $64, %esp
-; X86-I64-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NEXT:    movl 36(%ebp), %edi
-; X86-I64-NEXT:    movl 40(%ebp), %ebx
-; X86-I64-NEXT:    pushl 24(%ebp)
-; X86-I64-NEXT:    pushl 20(%ebp)
-; X86-I64-NEXT:    pushl 16(%ebp)
-; X86-I64-NEXT:    pushl 12(%ebp)
-; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
-; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl %ebx
-; X86-I64-NEXT:    pushl %edi
-; X86-I64-NEXT:    pushl 32(%ebp)
-; X86-I64-NEXT:    pushl 28(%ebp)
+; X86-I64-NEXT:    subl $48, %esp
+; X86-I64-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 56(%ebp)
-; X86-I64-NEXT:    pushl 52(%ebp)
-; X86-I64-NEXT:    pushl 48(%ebp)
-; X86-I64-NEXT:    pushl 44(%ebp)
+; X86-I64-NEXT:    movl 52(%ebp), %eax
+; X86-I64-NEXT:    movl 56(%ebp), %ecx
+; X86-I64-NEXT:    movl 44(%ebp), %edx
+; X86-I64-NEXT:    movl 48(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 72(%ebp)
-; X86-I64-NEXT:    pushl 68(%ebp)
-; X86-I64-NEXT:    pushl 64(%ebp)
-; X86-I64-NEXT:    pushl 60(%ebp)
+; X86-I64-NEXT:    movl 68(%ebp), %eax
+; X86-I64-NEXT:    movl 72(%ebp), %ecx
+; X86-I64-NEXT:    movl 60(%ebp), %edx
+; X86-I64-NEXT:    movl 64(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 88(%ebp)
-; X86-I64-NEXT:    pushl 84(%ebp)
-; X86-I64-NEXT:    pushl 80(%ebp)
-; X86-I64-NEXT:    pushl 76(%ebp)
+; X86-I64-NEXT:    movl 84(%ebp), %eax
+; X86-I64-NEXT:    movl 88(%ebp), %ecx
+; X86-I64-NEXT:    movl 76(%ebp), %edx
+; X86-I64-NEXT:    movl 80(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 104(%ebp)
-; X86-I64-NEXT:    pushl 100(%ebp)
-; X86-I64-NEXT:    pushl 96(%ebp)
-; X86-I64-NEXT:    pushl 92(%ebp)
+; X86-I64-NEXT:    movl 100(%ebp), %eax
+; X86-I64-NEXT:    movl 104(%ebp), %ecx
+; X86-I64-NEXT:    movl 92(%ebp), %edx
+; X86-I64-NEXT:    movl 96(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NEXT:    pushl 120(%ebp)
-; X86-I64-NEXT:    pushl 116(%ebp)
-; X86-I64-NEXT:    pushl 112(%ebp)
-; X86-I64-NEXT:    pushl 108(%ebp)
+; X86-I64-NEXT:    movl 116(%ebp), %eax
+; X86-I64-NEXT:    movl 120(%ebp), %ecx
+; X86-I64-NEXT:    movl 108(%ebp), %edx
+; X86-I64-NEXT:    movl 112(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
 ; X86-I64-NEXT:    addl $16, %esp
 ; X86-I64-NEXT:    movl %eax, %edi
 ; X86-I64-NEXT:    movl %edx, %ebx
-; X86-I64-NEXT:    pushl 136(%ebp)
-; X86-I64-NEXT:    pushl 132(%ebp)
-; X86-I64-NEXT:    pushl 128(%ebp)
-; X86-I64-NEXT:    pushl 124(%ebp)
+; X86-I64-NEXT:    movl 132(%ebp), %eax
+; X86-I64-NEXT:    movl 136(%ebp), %ecx
+; X86-I64-NEXT:    movl 124(%ebp), %edx
+; X86-I64-NEXT:    movl 128(%ebp), %esi
+; X86-I64-NEXT:    pushl %ecx
+; X86-I64-NEXT:    pushl %eax
+; X86-I64-NEXT:    pushl %esi
+; X86-I64-NEXT:    pushl %edx
 ; X86-I64-NEXT:    calll lrintl
-; X86-I64-NEXT:    addl $16, %esp
+; X86-I64-NEXT:    movl 24(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 16(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NEXT:    movl %ecx, (%esp)
+; X86-I64-NEXT:    movl 8(%ebp), %esi
 ; X86-I64-NEXT:    movl %edx, 60(%esi)
 ; X86-I64-NEXT:    movl %eax, 56(%esi)
 ; X86-I64-NEXT:    movl %ebx, 52(%esi)
@@ -2702,9 +2746,9 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I64-NEXT:    movl %eax, 12(%esi)
 ; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I64-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NEXT:    movl %eax, 4(%esi)
-; X86-I64-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NEXT:    calll lrintl
+; X86-I64-NEXT:    addl $16, %esp
+; X86-I64-NEXT:    movl %edx, 4(%esi)
 ; X86-I64-NEXT:    movl %eax, (%esi)
 ; X86-I64-NEXT:    movl %esi, %eax
 ; X86-I64-NEXT:    leal -12(%ebp), %esp
@@ -2722,72 +2766,97 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    pushl %edi
 ; X86-I64-NOX87-NEXT:    pushl %esi
 ; X86-I64-NOX87-NEXT:    andl $-16, %esp
-; X86-I64-NOX87-NEXT:    subl $64, %esp
-; X86-I64-NOX87-NEXT:    movl 8(%ebp), %esi
-; X86-I64-NOX87-NEXT:    movl 36(%ebp), %edi
-; X86-I64-NOX87-NEXT:    movl 40(%ebp), %ebx
-; X86-I64-NOX87-NEXT:    pushl 24(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 20(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 16(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 12(%ebp)
-; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
-; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl %ebx
-; X86-I64-NOX87-NEXT:    pushl %edi
-; X86-I64-NOX87-NEXT:    pushl 32(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 28(%ebp)
+; X86-I64-NOX87-NEXT:    subl $48, %esp
+; X86-I64-NOX87-NEXT:    movl 36(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 40(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 28(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 32(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 56(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 52(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 48(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 44(%ebp)
+; X86-I64-NOX87-NEXT:    movl 52(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 56(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 44(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 48(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 72(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 68(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 64(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 60(%ebp)
+; X86-I64-NOX87-NEXT:    movl 68(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 72(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 60(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 64(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 88(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 84(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 80(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 76(%ebp)
+; X86-I64-NOX87-NEXT:    movl 84(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 88(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 76(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 80(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 104(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 100(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 96(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 92(%ebp)
+; X86-I64-NOX87-NEXT:    movl 100(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 104(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 92(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 96(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-I64-NOX87-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-I64-NOX87-NEXT:    pushl 120(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 116(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 112(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 108(%ebp)
+; X86-I64-NOX87-NEXT:    movl 116(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 120(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 108(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 112(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
 ; X86-I64-NOX87-NEXT:    addl $16, %esp
 ; X86-I64-NOX87-NEXT:    movl %eax, %edi
 ; X86-I64-NOX87-NEXT:    movl %edx, %ebx
-; X86-I64-NOX87-NEXT:    pushl 136(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 132(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 128(%ebp)
-; X86-I64-NOX87-NEXT:    pushl 124(%ebp)
+; X86-I64-NOX87-NEXT:    movl 132(%ebp), %eax
+; X86-I64-NOX87-NEXT:    movl 136(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl 124(%ebp), %edx
+; X86-I64-NOX87-NEXT:    movl 128(%ebp), %esi
+; X86-I64-NOX87-NEXT:    pushl %ecx
+; X86-I64-NOX87-NEXT:    pushl %eax
+; X86-I64-NOX87-NEXT:    pushl %esi
+; X86-I64-NOX87-NEXT:    pushl %edx
 ; X86-I64-NOX87-NEXT:    calll lrintl
-; X86-I64-NOX87-NEXT:    addl $16, %esp
+; X86-I64-NOX87-NEXT:    movl 24(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 20(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 16(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-I64-NOX87-NEXT:    movl 12(%ebp), %ecx
+; X86-I64-NOX87-NEXT:    movl %ecx, (%esp)
+; X86-I64-NOX87-NEXT:    movl 8(%ebp), %esi
 ; X86-I64-NOX87-NEXT:    movl %edx, 60(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, 56(%esi)
 ; X86-I64-NOX87-NEXT:    movl %ebx, 52(%esi)
@@ -2812,9 +2881,9 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-I64-NOX87-NEXT:    movl %eax, 12(%esi)
 ; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-I64-NOX87-NEXT:    movl %eax, 8(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-I64-NOX87-NEXT:    movl %eax, 4(%esi)
-; X86-I64-NOX87-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-I64-NOX87-NEXT:    calll lrintl
+; X86-I64-NOX87-NEXT:    addl $16, %esp
+; X86-I64-NOX87-NEXT:    movl %edx, 4(%esi)
 ; X86-I64-NOX87-NEXT:    movl %eax, (%esi)
 ; X86-I64-NOX87-NEXT:    movl %esi, %eax
 ; X86-I64-NOX87-NEXT:    leal -12(%ebp), %esp
@@ -2832,85 +2901,115 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
 ; X86-SSE2-NEXT:    andl $-16, %esp
-; X86-SSE2-NEXT:    subl $64, %esp
-; X86-SSE2-NEXT:    movl 108(%ebp), %esi
-; X86-SSE2-NEXT:    movl 112(%ebp), %edi
-; X86-SSE2-NEXT:    movl 116(%ebp), %ebx
-; X86-SSE2-NEXT:    pushl 100(%ebp)
-; X86-SSE2-NEXT:    pushl 96(%ebp)
-; X86-SSE2-NEXT:    pushl 92(%ebp)
-; X86-SSE2-NEXT:    pushl 88(%ebp)
+; X86-SSE2-NEXT:    subl $80, %esp
+; X86-SSE2-NEXT:    movl 96(%ebp), %eax
+; X86-SSE2-NEXT:    movl 100(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 88(%ebp), %edx
+; X86-SSE2-NEXT:    movl 92(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    movl 112(%ebp), %eax
+; X86-SSE2-NEXT:    movl 116(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 104(%ebp), %edx
+; X86-SSE2-NEXT:    movl 108(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    pushl 104(%ebp)
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    pushl 132(%ebp)
-; X86-SSE2-NEXT:    pushl 128(%ebp)
-; X86-SSE2-NEXT:    pushl 124(%ebp)
-; X86-SSE2-NEXT:    pushl 120(%ebp)
+; X86-SSE2-NEXT:    movl 128(%ebp), %eax
+; X86-SSE2-NEXT:    movl 132(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 120(%ebp), %edx
+; X86-SSE2-NEXT:    movl 124(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT:    pushl 20(%ebp)
-; X86-SSE2-NEXT:    pushl 16(%ebp)
-; X86-SSE2-NEXT:    pushl 12(%ebp)
-; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 16(%ebp), %eax
+; X86-SSE2-NEXT:    movl 20(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 8(%ebp), %edx
+; X86-SSE2-NEXT:    movl 12(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
-; X86-SSE2-NEXT:    movl %eax, %esi
-; X86-SSE2-NEXT:    pushl 36(%ebp)
-; X86-SSE2-NEXT:    pushl 32(%ebp)
-; X86-SSE2-NEXT:    pushl 28(%ebp)
-; X86-SSE2-NEXT:    pushl 24(%ebp)
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 32(%ebp), %eax
+; X86-SSE2-NEXT:    movl 36(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 24(%ebp), %edx
+; X86-SSE2-NEXT:    movl 28(%ebp), %edi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %edi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %edi
-; X86-SSE2-NEXT:    pushl 52(%ebp)
-; X86-SSE2-NEXT:    pushl 48(%ebp)
-; X86-SSE2-NEXT:    pushl 44(%ebp)
-; X86-SSE2-NEXT:    pushl 40(%ebp)
+; X86-SSE2-NEXT:    movl 48(%ebp), %eax
+; X86-SSE2-NEXT:    movl 52(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 40(%ebp), %edx
+; X86-SSE2-NEXT:    movl 44(%ebp), %ebx
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %ebx
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movl %eax, %ebx
-; X86-SSE2-NEXT:    pushl 68(%ebp)
-; X86-SSE2-NEXT:    pushl 64(%ebp)
-; X86-SSE2-NEXT:    pushl 60(%ebp)
-; X86-SSE2-NEXT:    pushl 56(%ebp)
+; X86-SSE2-NEXT:    movl 64(%ebp), %esi
+; X86-SSE2-NEXT:    movl 68(%ebp), %ecx
+; X86-SSE2-NEXT:    movl 56(%ebp), %edx
+; X86-SSE2-NEXT:    movl 60(%ebp), %eax
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movd %eax, %xmm0
 ; X86-SSE2-NEXT:    movd %ebx, %xmm1
 ; X86-SSE2-NEXT:    movd %edi, %xmm2
-; X86-SSE2-NEXT:    movd %esi, %xmm4
-; X86-SSE2-NEXT:    movss (%esp), %xmm3 # 4-byte Reload
-; X86-SSE2-NEXT:    # xmm3 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 4-byte Reload
-; X86-SSE2-NEXT:    # xmm5 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 4-byte Reload
-; X86-SSE2-NEXT:    # xmm6 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movaps %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; X86-SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-SSE2-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; X86-SSE2-NEXT:    movaps %xmm5, (%esp) # 16-byte Spill
-; X86-SSE2-NEXT:    pushl 84(%ebp)
-; X86-SSE2-NEXT:    pushl 80(%ebp)
-; X86-SSE2-NEXT:    pushl 76(%ebp)
-; X86-SSE2-NEXT:    pushl 72(%ebp)
+; X86-SSE2-NEXT:    movd {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 4-byte Folded Reload
+; X86-SSE2-NEXT:    # xmm3 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movl 80(%ebp), %eax
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X86-SSE2-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movl 84(%ebp), %ecx
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; X86-SSE2-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
+; X86-SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movl 72(%ebp), %edx
+; X86-SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
+; X86-SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT:    movl 76(%ebp), %esi
+; X86-SSE2-NEXT:    pushl %ecx
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    pushl %edx
 ; X86-SSE2-NEXT:    calll lrintl
 ; X86-SSE2-NEXT:    addl $16, %esp
 ; X86-SSE2-NEXT:    movd %eax, %xmm1
 ; X86-SSE2-NEXT:    punpckldq {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; X86-SSE2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; X86-SSE2-NEXT:    punpcklqdq (%esp), %xmm1 # 16-byte Folded Reload
+; X86-SSE2-NEXT:    punpcklqdq {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; X86-SSE2-NEXT:    # xmm1 = xmm1[0],mem[0]
 ; X86-SSE2-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-SSE2-NEXT:    leal -12(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 817a6959f1e08..27fdb79f8d59a 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -820,13 +820,13 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X86-SSE-NEXT:    paddw %xmm3, %xmm3
-; X86-SSE-NEXT:    paddw %xmm3, %xmm1
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
 ; X86-SSE-NEXT:    paddw %xmm3, %xmm3
 ; X86-SSE-NEXT:    paddw %xmm2, %xmm0
 ; X86-SSE-NEXT:    paddw %xmm3, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    paddw %xmm2, %xmm2
+; X86-SSE-NEXT:    paddw %xmm2, %xmm1
 ; X86-SSE-NEXT:    paddw 8(%ebp), %xmm1
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -2023,10 +2023,10 @@ define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
 ; X86-SSE2-LABEL: mul_v2i64_zext_cross_bb:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
 ; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
@@ -2035,8 +2035,8 @@ define <2 x i64> @mul_v2i64_zext_cross_bb(ptr %in, ptr %y) {
 ; X86-SSE4-LABEL: mul_v2i64_zext_cross_bb:
 ; X86-SSE4:       # %bb.0:
 ; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; X86-SSE4-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-SSE4-NEXT:    retl
@@ -2079,29 +2079,29 @@ define <4 x i64> @mul_v4i64_zext_cross_bb(ptr %in, ptr %y) {
 ; X86-SSE2-LABEL: mul_v4i64_zext_cross_bb:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqa (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqa (%eax), %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movdqa (%eax), %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
-; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,1,3]
+; X86-SSE2-NEXT:    pmuludq %xmm3, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
+; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE4-LABEL: mul_v4i64_zext_cross_bb:
 ; X86-SSE4:       # %bb.0:
 ; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm1
+; X86-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 ; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm0
+; X86-SSE4-NEXT:    pmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; X86-SSE4-NEXT:    pmuludq %xmm2, %xmm1
 ; X86-SSE4-NEXT:    retl
 ;
 ; X64-SSE2-LABEL: mul_v4i64_zext_cross_bb:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll
index df6e079785568..4edc9a2e6310a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll
@@ -164,17 +164,17 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pand 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pand %xmm3, %xmm1
-; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE-NEXT:    pand %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    movd %xmm0, %edx
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -412,18 +412,18 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pand 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pand %xmm3, %xmm1
-; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE-NEXT:    pand %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl
@@ -733,21 +733,21 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pand 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pand %xmm3, %xmm1
-; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pand %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -1128,24 +1128,24 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pand 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pand %xmm3, %xmm1
-; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pand 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pand 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrlw $8, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
 ; X86-SSE-NEXT:    pand %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    psrlw $8, %xmm0
+; X86-SSE-NEXT:    pand %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index f80544fdef7e6..8125d41fce80f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -1349,10 +1349,10 @@ define i1 @icmp0_v32i16_v32i1(<32 x i16>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm0
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqw 8(%ebp), %xmm3
 ; X86-SSE2-NEXT:    por %xmm1, %xmm3
 ; X86-SSE2-NEXT:    packsswb %xmm3, %xmm0
@@ -1450,10 +1450,10 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm0
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb 8(%ebp), %xmm3
 ; X86-SSE2-NEXT:    por %xmm1, %xmm3
 ; X86-SSE2-NEXT:    por %xmm0, %xmm3
@@ -2303,12 +2303,12 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
-; X86-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pcmpeqw 72(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pcmpeqw 40(%ebp), %xmm1
 ; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
+; X86-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    packsswb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    testl %eax, %eax
@@ -2403,14 +2403,14 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb 56(%ebp), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb 24(%ebp), %xmm0
-; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb 72(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pcmpeqb 40(%ebp), %xmm1
 ; X86-SSE2-NEXT:    por %xmm3, %xmm1
-; X86-SSE2-NEXT:    por %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X86-SSE2-NEXT:    pcmpeqb 56(%ebp), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb 24(%ebp), %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    testl %eax, %eax
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
index 3c960f255046c..6a5e73e6a8def 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -164,17 +164,17 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    por 56(%ebp), %xmm2
-; X86-SSE-NEXT:    por 24(%ebp), %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    por 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    por %xmm3, %xmm1
-; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    por 56(%ebp), %xmm2
+; X86-SSE-NEXT:    por 24(%ebp), %xmm0
+; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE-NEXT:    por %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    movd %xmm0, %edx
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -412,18 +412,18 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    por 56(%ebp), %xmm2
-; X86-SSE-NEXT:    por 24(%ebp), %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    por 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    por %xmm3, %xmm1
-; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    por 56(%ebp), %xmm2
+; X86-SSE-NEXT:    por 24(%ebp), %xmm0
+; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE-NEXT:    por %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl
@@ -733,21 +733,21 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    por 56(%ebp), %xmm2
-; X86-SSE-NEXT:    por 24(%ebp), %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    por 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    por %xmm3, %xmm1
-; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    por 56(%ebp), %xmm2
+; X86-SSE-NEXT:    por 24(%ebp), %xmm0
+; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    por %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -1128,24 +1128,24 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    por 56(%ebp), %xmm2
-; X86-SSE-NEXT:    por 24(%ebp), %xmm0
-; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    por 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    por %xmm3, %xmm1
-; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    por 56(%ebp), %xmm2
+; X86-SSE-NEXT:    por 24(%ebp), %xmm0
+; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    por %xmm1, %xmm0
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrlw $8, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
 ; X86-SSE-NEXT:    por %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    psrlw $8, %xmm0
+; X86-SSE-NEXT:    por %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 6cb43234d713b..fccd571bf0675 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -1796,10 +1796,10 @@ define i1 @icmp0_v32i16_v32i1(<32 x i16>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqw 8(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
 ; X86-SSE2-NEXT:    packsswb %xmm3, %xmm0
@@ -1917,10 +1917,10 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb 8(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm0, %xmm3
@@ -2885,12 +2885,12 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
-; X86-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pcmpeqw 72(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pcmpeqw 40(%ebp), %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm1
+; X86-SSE2-NEXT:    pcmpeqw 56(%ebp), %xmm2
+; X86-SSE2-NEXT:    pcmpeqw 24(%ebp), %xmm0
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    packsswb %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    xorb %ah, %al
@@ -3005,14 +3005,14 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
 ; X86-SSE2-NEXT:    andl $-16, %esp
 ; X86-SSE2-NEXT:    subl $16, %esp
 ; X86-SSE2-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb 56(%ebp), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb 24(%ebp), %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb 72(%ebp), %xmm3
 ; X86-SSE2-NEXT:    pcmpeqb 40(%ebp), %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X86-SSE2-NEXT:    pcmpeqb 56(%ebp), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb 24(%ebp), %xmm0
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    xorb %ah, %al
 ; X86-SSE2-NEXT:    setnp %al
 ; X86-SSE2-NEXT:    movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
index 33199388fd6fc..7b91d4f5649e6 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
@@ -164,17 +164,17 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pxor 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; X86-SSE-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    movd %xmm0, %edx
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -412,18 +412,18 @@ define i32 @test_v32i32(<32 x i32> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pxor 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
 ; X86-SSE-NEXT:    retl
@@ -733,21 +733,21 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pxor 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE-NEXT:    movd %xmm0, %eax
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
 ; X86-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
@@ -1128,24 +1128,24 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind {
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
 ; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
-; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
-; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
-; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor 72(%ebp), %xmm3
 ; X86-SSE-NEXT:    pxor 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pxor %xmm3, %xmm1
-; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE-NEXT:    pxor 56(%ebp), %xmm2
+; X86-SSE-NEXT:    pxor 24(%ebp), %xmm0
+; X86-SSE-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    psrld $16, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE-NEXT:    psrlw $8, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
 ; X86-SSE-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE-NEXT:    psrlw $8, %xmm0
+; X86-SSE-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE-NEXT:    movd %xmm0, %eax
 ; X86-SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index dd9594b11d96b..d19029888efc8 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -101,14 +101,14 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; X86-SSE2-LABEL: var_rotate_v2i64:
 ; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psllq %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    psllq %xmm3, %xmm4
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
 ; X86-SSE2-NEXT:    psubq %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    psllq %xmm1, %xmm3
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; X86-SSE2-NEXT:    psllq %xmm1, %xmm4
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    psrlq %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -722,12 +722,12 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; X86-SSE2-LABEL: splatvar_rotate_v2i64:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movd {{.*#+}} xmm2 = [64,0,0,0]
-; X86-SSE2-NEXT:    psubq %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE2-NEXT:    psllq %xmm1, %xmm3
-; X86-SSE2-NEXT:    psrlq %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm3, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    psllq %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd {{.*#+}} xmm3 = [64,0,0,0]
+; X86-SSE2-NEXT:    psubq %xmm1, %xmm3
+; X86-SSE2-NEXT:    psrlq %xmm3, %xmm0
+; X86-SSE2-NEXT:    por %xmm2, %xmm0
 ; X86-SSE2-NEXT:    retl
   %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
   %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
@@ -999,9 +999,9 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ;
 ; X86-SSE2-LABEL: splatvar_rotate_v16i8:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    psllw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    psrlw $8, %xmm2
@@ -1944,8 +1944,8 @@ define <4 x i32> @rot16_demandedbits(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: rot16_demandedbits:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrld $11, %xmm1
-; X86-SSE2-NEXT:    pslld $11, %xmm0
+; X86-SSE2-NEXT:    pslld $11, %xmm1
+; X86-SSE2-NEXT:    psrld $11, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    retl
@@ -2012,8 +2012,8 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-SSE2-LABEL: rot16_trunc:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrld $11, %xmm1
-; X86-SSE2-NEXT:    pslld $5, %xmm0
+; X86-SSE2-NEXT:    pslld $5, %xmm1
+; X86-SSE2-NEXT:    psrld $11, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 6ce34991050ac..225b8e26b24f1 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -1958,12 +1958,12 @@ define <4 x i64> @load_sext_4i1_to_4i64(ptr%ptr) {
 ; X86-SSE41-NEXT:    andb $1, %cl
 ; X86-SSE41-NEXT:    movzbl %cl, %ecx
 ; X86-SSE41-NEXT:    pinsrb $8, %ecx, %xmm1
-; X86-SSE41-NEXT:    shrb $3, %al
-; X86-SSE41-NEXT:    movzbl %al, %eax
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1]
-; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
 ; X86-SSE41-NEXT:    pslld $31, %xmm0
 ; X86-SSE41-NEXT:    psrad $31, %xmm0
+; X86-SSE41-NEXT:    shrb $3, %al
+; X86-SSE41-NEXT:    movzbl %al, %eax
+; X86-SSE41-NEXT:    pinsrb $12, %eax, %xmm1
 ; X86-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; X86-SSE41-NEXT:    pslld $31, %xmm1
 ; X86-SSE41-NEXT:    psrad $31, %xmm1
@@ -3517,13 +3517,13 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
 ; X86-SSE-NEXT:    movl %esp, %ebp
 ; X86-SSE-NEXT:    andl $-16, %esp
 ; X86-SSE-NEXT:    subl $16, %esp
-; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm3
 ; X86-SSE-NEXT:    pcmpeqw 40(%ebp), %xmm1
 ; X86-SSE-NEXT:    pcmpeqw 24(%ebp), %xmm0
 ; X86-SSE-NEXT:    packsswb %xmm1, %xmm0
-; X86-SSE-NEXT:    pcmpeqw 72(%ebp), %xmm3
+; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm1
+; X86-SSE-NEXT:    pcmpeqw 72(%ebp), %xmm1
 ; X86-SSE-NEXT:    pcmpeqw 56(%ebp), %xmm2
-; X86-SSE-NEXT:    packsswb %xmm3, %xmm2
+; X86-SSE-NEXT:    packsswb %xmm1, %xmm2
 ; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index a847da6eff3ff..6c6f285b32b2d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -1747,15 +1747,15 @@ define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x
 ; X86-SSE-NEXT:    xorps %xmm3, %xmm3
 ; X86-SSE-NEXT:    xorps %xmm4, %xmm4
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
-; X86-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
+; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm2
+; X86-SSE-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE-NEXT:    psrld %xmm4, %xmm5
 ; X86-SSE-NEXT:    pslld $31, %xmm0
 ; X86-SSE-NEXT:    psrad $31, %xmm0
-; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
 ; X86-SSE-NEXT:    psrld %xmm3, %xmm2
-; X86-SSE-NEXT:    psrld %xmm4, %xmm1
 ; X86-SSE-NEXT:    pand %xmm0, %xmm2
-; X86-SSE-NEXT:    pandn %xmm1, %xmm0
+; X86-SSE-NEXT:    pandn %xmm5, %xmm0
 ; X86-SSE-NEXT:    por %xmm2, %xmm0
 ; X86-SSE-NEXT:    movl %ebp, %esp
 ; X86-SSE-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index db3be98efa530..8b49d1d325b70 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -493,10 +493,10 @@ define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) {
 ; X86-AVX512F-LABEL: test_masked_permps_v8f32:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [23,22,19,3,23,22,6,7]
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    vmovaps (%eax), %ymm1
-; X86-AVX512F-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7]
-; X86-AVX512F-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
+; X86-AVX512F-NEXT:    vmovaps (%eax), %ymm2
+; X86-AVX512F-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
 ; X86-AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; X86-AVX512F-NEXT:    retl
 ;
@@ -550,13 +550,13 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
 ; X86-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
 ; X86-AVX512-SLOW:       # %bb.0:
 ; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-SLOW-NEXT:    vpbroadcastd 44(%ecx), %xmm0
+; X86-AVX512-SLOW-NEXT:    vpbroadcastd 44(%eax), %xmm0
 ; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%eax)
+; X86-AVX512-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 672(%ecx)
 ; X86-AVX512-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
 ; X86-AVX512-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%eax)
+; X86-AVX512-SLOW-NEXT:    vmovdqa %ymm0, 832(%ecx)
 ; X86-AVX512-SLOW-NEXT:    vzeroupper
 ; X86-AVX512-SLOW-NEXT:    retl
 ;
@@ -574,13 +574,13 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
 ; X86-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
 ; X86-AVX512-FAST:       # %bb.0:
 ; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-FAST-NEXT:    vpbroadcastd 44(%ecx), %xmm0
+; X86-AVX512-FAST-NEXT:    vpbroadcastd 44(%eax), %xmm0
 ; X86-AVX512-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%eax)
-; X86-AVX512-FAST-NEXT:    vmovdqa 208(%ecx), %xmm0
+; X86-AVX512-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 672(%ecx)
+; X86-AVX512-FAST-NEXT:    vmovdqa 208(%eax), %xmm0
 ; X86-AVX512-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%eax)
+; X86-AVX512-FAST-NEXT:    vmovdqa %ymm0, 832(%ecx)
 ; X86-AVX512-FAST-NEXT:    vzeroupper
 ; X86-AVX512-FAST-NEXT:    retl
 ;
@@ -598,13 +598,13 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
 ; X86-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
 ; X86-AVX512F:       # %bb.0:
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512F-NEXT:    vpbroadcastd 44(%ecx), %xmm0
+; X86-AVX512F-NEXT:    vpbroadcastd 44(%eax), %xmm0
 ; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512F-NEXT:    vmovdqa %ymm0, 672(%eax)
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512F-NEXT:    vmovdqa %ymm0, 672(%ecx)
 ; X86-AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
 ; X86-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512F-NEXT:    vmovdqa %ymm0, 832(%eax)
+; X86-AVX512F-NEXT:    vmovdqa %ymm0, 832(%ecx)
 ; X86-AVX512F-NEXT:    vzeroupper
 ; X86-AVX512F-NEXT:    retl
 ;
@@ -655,8 +655,8 @@ define <32 x float> @PR47534(<8 x float> %tmp) {
 define void @PR43170(ptr %a0) {
 ; X86-AVX512-LABEL: PR43170:
 ; X86-AVX512:       # %bb.0: # %entry
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovaps src1, %ymm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -670,8 +670,8 @@ define void @PR43170(ptr %a0) {
 ;
 ; X86-AVX512F-LABEL: PR43170:
 ; X86-AVX512F:       # %bb.0: # %entry
-; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    vmovaps src1, %ymm0
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    vmovaps %zmm0, (%eax)
 ; X86-AVX512F-NEXT:    vzeroupper
 ; X86-AVX512F-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index a7313fb35ee17..b6b4d4d1e8035 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -413,12 +413,12 @@ define void @PR39483() {
 ;
 ; X86-AVX512-LABEL: PR39483:
 ; X86-AVX512:       # %bb.0: # %entry
-; X86-AVX512-NEXT:    vmovups 64, %ymm0
-; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7]
-; X86-AVX512-NEXT:    vpermt2ps 0, %zmm1, %zmm0
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; X86-AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [18,21,24,27,30,1,4,7]
+; X86-AVX512-NEXT:    vmovups 64, %ymm1
+; X86-AVX512-NEXT:    vpermt2ps 0, %zmm0, %zmm1
+; X86-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm1
+; X86-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
 ;
 ; X64-AVX1-LABEL: PR39483:
@@ -472,50 +472,50 @@ entry:
 define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr noalias %out0, ptr noalias %out1, ptr noalias %out2) {
 ; X86-AVX1-LABEL: PR48908:
 ; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm4
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[0,1,2,2]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2]
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm5
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3]
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm5
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3]
-; X86-AVX1-NEXT:    vmovapd %ymm3, (%edx)
-; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3]
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
-; X86-AVX1-NEXT:    vmovapd %ymm3, (%ecx)
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%eax)
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3]
+; X86-AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovapd %ymm3, (%eax)
 ; X86-AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovapd %ymm0, (%eax)
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: PR48908:
 ; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[0,1],ymm2[0,1]
+; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm4
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm0[0,1]
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X86-AVX2-NEXT:    vmovapd %ymm3, (%eax)
+; X86-AVX2-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
-; X86-AVX2-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
-; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
-; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
-; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
-; X86-AVX2-NEXT:    vmovapd %ymm3, (%ecx)
-; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vmovapd %ymm3, (%eax)
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
 ; X86-AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovapd %ymm0, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
@@ -525,23 +525,23 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
 ; X86-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X86-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [0,8,2,1]
+; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
+; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX512-NEXT:    vmovapd %ymm3, (%eax)
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
-; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm4
-; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
-; X86-AVX512-NEXT:    vmovapd %ymm4, (%edx)
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm0, %zmm3, %zmm4
-; X86-AVX512-NEXT:    vmovapd %ymm4, (%ecx)
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovapd %ymm4, (%eax)
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,11,0,0]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm3
 ; X86-AVX512-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
 ; X86-AVX512-NEXT:    vpermi2pd %zmm3, %zmm2, %zmm0
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    vmovapd %ymm0, (%eax)
 ; X86-AVX512-NEXT:    vzeroupper
 ; X86-AVX512-NEXT:    retl
@@ -695,14 +695,14 @@ define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {
 }
 
 define <8 x i32> @concat_self_v8i32(<4 x i32> %x) {
-; AVX1-LABEL: concat_self_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,1,3]
-; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    ret{{[l|q]}}
+; X86-AVX1-LABEL: concat_self_v8i32:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
+; X86-AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,2,1,0]
+; X86-AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT:    retl
 ;
 ; AVX2-LABEL: concat_self_v8i32:
 ; AVX2:       # %bb.0:
@@ -721,6 +721,15 @@ define <8 x i32> @concat_self_v8i32(<4 x i32> %x) {
 ; AVX512-NEXT:    vpermd %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    ret{{[l|q]}}
+;
+; X64-AVX1-LABEL: concat_self_v8i32:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,2,1,0]
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,1,3]
+; X64-AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    retq
   %cat = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   %s = shufflevector <8 x i32> %cat, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 0, i32 2, i32 1, i32 3>
   %a = add <8 x i32> %s, %cat
@@ -769,8 +778,8 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind {
 ; X86-AVX2-NEXT:    movl %esp, %ebp
 ; X86-AVX2-NEXT:    andl $-32, %esp
 ; X86-AVX2-NEXT:    subl $32, %esp
-; X86-AVX2-NEXT:    vmovaps 8(%ebp), %ymm6
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
+; X86-AVX2-NEXT:    vmovaps 8(%ebp), %ymm6
 ; X86-AVX2-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm6[0],ymm2[2],ymm6[2]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3]
 ; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index de8eea59b2c75..533bbbb18a8d3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -674,12 +674,19 @@ define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; CHECK-LABEL: combine_pshufb_as_packsswb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsraw $11, %ymm0, %ymm0
-; CHECK-NEXT:    vpsraw $11, %ymm1, %ymm1
-; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_pshufb_as_packsswb:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsraw $11, %ymm1, %ymm1
+; X86-NEXT:    vpsraw $11, %ymm0, %ymm0
+; X86-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_packsswb:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsraw $11, %ymm0, %ymm0
+; X64-NEXT:    vpsraw $11, %ymm1, %ymm1
+; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %1 = ashr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = ashr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <16 x i16> %1 to <32 x i8>
@@ -691,12 +698,19 @@ define <32 x i8> @combine_pshufb_as_packsswb(<16 x i16> %a0, <16 x i16> %a1) nou
 }
 
 define <32 x i8> @combine_pshufb_as_packuswb(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; CHECK-LABEL: combine_pshufb_as_packuswb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrlw $11, %ymm0, %ymm0
-; CHECK-NEXT:    vpsrlw $11, %ymm1, %ymm1
-; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_pshufb_as_packuswb:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlw $11, %ymm1, %ymm1
+; X86-NEXT:    vpsrlw $11, %ymm0, %ymm0
+; X86-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_packuswb:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlw $11, %ymm0, %ymm0
+; X64-NEXT:    vpsrlw $11, %ymm1, %ymm1
+; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %1 = lshr <16 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = lshr <16 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <16 x i16> %1 to <32 x i8>
@@ -788,12 +802,19 @@ define <4 x i32> @extract_vpermd(<8 x i32> %a0) {
 
 ; Not beneficial to concatenate both inputs just to create a 256-bit vpaddb
 define <32 x i8> @concat_add_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: concat_add_unnecessary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_add_unnecessary:
+; X86:       # %bb.0:
+; X86-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_add_unnecessary:
+; X64:       # %bb.0:
+; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %lo = add <16 x i8> %a0, %a1
   %hi = add <16 x i8> %a0, %a2
   %res = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -802,12 +823,19 @@ define <32 x i8> @concat_add_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1, <
 
 ; Not beneficial to concatenate both inputs just to create a 256-bit vpmullw
 define <16 x i16> @concat_mul_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind {
-; CHECK-LABEL: concat_mul_unnecessary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_mul_unnecessary:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
+; X86-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_mul_unnecessary:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmullw %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %lo = mul <8 x i16> %a0, %a1
   %hi = mul <8 x i16> %a0, %a2
   %res = shufflevector <8 x i16> %lo, <8 x i16> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -816,12 +844,19 @@ define <16 x i16> @concat_mul_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
 
 ; Not beneficial to concatenate both inputs just to create a 256-bit palignr
 define <32 x i8> @concat_alignr_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: concat_alignr_unnecessary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_alignr_unnecessary:
+; X86:       # %bb.0:
+; X86-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
+; X86-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_alignr_unnecessary:
+; X64:       # %bb.0:
+; X64-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
+; X64-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2]
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %lo = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
   %hi = shufflevector <16 x i8> %a2, <16 x i8> %a0, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
   %res = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -830,12 +865,19 @@ define <32 x i8> @concat_alignr_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1
 
 ; Not beneficial to concatenate both inputs just to create a 256-bit packss
 define <32 x i8> @concat_packss_unnecessary(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) nounwind {
-; CHECK-LABEL: concat_packss_unnecessary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_packss_unnecessary:
+; X86:       # %bb.0:
+; X86-NEXT:    vpacksswb %xmm2, %xmm0, %xmm2
+; X86-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_packss_unnecessary:
+; X64:       # %bb.0:
+; X64-NEXT:    vpacksswb %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %lo = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
   %hi = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a2)
   %res = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -845,12 +887,19 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
 
 ; Not beneficial to concatenate both inputs just to create a 256-bit pshufb
 define <32 x i8> @concat_pshufb_unnecessary(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: concat_pshufb_unnecessary:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
-; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_pshufb_unnecessary:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_pshufb_unnecessary:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    retq
   %lo = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
   %hi = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a2)
   %res = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -956,26 +1005,47 @@ define <32 x i8> @PR27320(<8 x i32> %a0) {
 }
 
 define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
-; AVX2-LABEL: PR34577:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [u,u,7,2,u,u,3,2]
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT:    ret{{[l|q]}}
+; X86-AVX2-LABEL: PR34577:
+; X86-AVX2:       # %bb.0: # %entry
+; X86-AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [u,u,7,2,u,u,3,2]
+; X86-AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; X86-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X86-AVX2-NEXT:    retl
 ;
-; AVX512-LABEL: PR34577:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [23,18,7,2,20,0,3,2]
-; AVX512-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    ret{{[l|q]}}
+; X86-AVX512-LABEL: PR34577:
+; X86-AVX512:       # %bb.0: # %entry
+; X86-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [23,18,7,2,20,0,3,2]
+; X86-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; X86-AVX512-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; X86-AVX512-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
+; X86-AVX512-NEXT:    vpermi2ps %zmm0, %zmm1, %zmm2
+; X86-AVX512-NEXT:    vmovaps %ymm2, %ymm0
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX2-LABEL: PR34577:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [u,u,7,2,u,u,3,2]
+; X64-AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: PR34577:
+; X64-AVX512:       # %bb.0: # %entry
+; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-AVX512-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; X64-AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX512-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
+; X64-AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [23,18,7,2,20,0,3,2]
+; X64-AVX512-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
+; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; X64-AVX512-NEXT:    retq
 entry:
   %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
   %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
@@ -1000,17 +1070,17 @@ define <32 x i8> @PR52122(<32 x i8> %0, <32 x i8> %1) {
 define void @PR63030(ptr %p0) {
 ; X86-AVX2-LABEL: PR63030:
 ; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [3,0,2,0]
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    vmovaps (%eax), %xmm0
-; X86-AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [3,0,3,0]
-; X86-AVX2-NEXT:    # xmm1 = mem[0,0]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
-; X86-AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [3,0,2,0]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
-; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT:    vmovaps (%eax), %xmm1
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,1,1,1]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; X86-AVX2-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = [3,0,3,0]
+; X86-AVX2-NEXT:    # xmm0 = mem[0,0]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[1,1,0,0]
+; X86-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
 ; X86-AVX2-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX2-NEXT:    vmovaps %ymm1, (%eax)
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -1108,26 +1178,26 @@ define <32 x i16> @PR158415(<8 x i8> %arg) {
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,xmm0[u,u,u,0,2,u,u,u,u,u,u,u,4]
 ; X86-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; X86-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X86-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u]
-; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; X86-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
-; X86-AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[12,13,14,15],zero,zero,ymm1[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm1[20,21],zero,zero,ymm1[26,27,28,29,30,31]
-; X86-AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,0,2]
+; X86-AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24],zero,ymm0[25],zero,ymm0[30],zero,ymm0[31],zero,ymm0[u,u,u,u,u,u,u,u]
+; X86-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; X86-AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,2,2,3]
 ; X86-AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm2
-; X86-AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpbroadcastw %xmm1, %ymm3
-; X86-AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4,5,6,7,8,9],ymm3[10],ymm0[11,12,13,14,15]
+; X86-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3
+; X86-AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,14,15],zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,28,29,30,31],zero,zero,ymm0[20,21],zero,zero,ymm0[26,27,28,29,30,31]
+; X86-AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,2]
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vpbroadcastw %xmm2, %ymm0
+; X86-AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4,5,6,7,8,9],ymm0[10],ymm3[11,12,13,14,15]
 ; X86-AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; X86-AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,xmm2[u,u],zero,zero
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; X86-AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero,zero,xmm1[u,u],zero,zero
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
 ; X86-AVX2-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: PR158415:
@@ -1229,20 +1299,35 @@ define <9 x i16> @PR172010(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,4,5,8,9,0,1],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    ret{{[l|q]}}
 ;
-; AVX512-LABEL: PR172010:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512-NEXT:    vprolq $16, %zmm0, %zmm0
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    ret{{[l|q]}}
+; X86-AVX512-LABEL: PR172010:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,6,7]
+; X86-AVX512-NEXT:    vpmovqw %zmm1, %xmm1
+; X86-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; X86-AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
+; X86-AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X86-AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
+; X86-AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; X86-AVX512-NEXT:    vprolq $16, %zmm0, %zmm0
+; X86-AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: PR172010:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,6,7]
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; X64-AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
+; X64-AVX512-NEXT:    vpmovqw %zmm1, %xmm1
+; X64-AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; X64-AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
+; X64-AVX512-NEXT:    vprolq $16, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-AVX512-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <16 x i32> <i32 5, i32 3, i32 0, i32 5, i32 2, i32 2, i32 1, i32 1, i32 4, i32 6, i32 5, i32 3, i32 1, i32 7, i32 2, i32 1>
   %trunc = trunc nuw <16 x i64> %shuffle to <16 x i16>
   %result = shufflevector <16 x i16> zeroinitializer, <16 x i16> %trunc, <9 x i32> <i32 31, i32 18, i32 28, i32 20, i32 7, i32 5, i32 8, i32 4, i32 7>
@@ -1252,25 +1337,25 @@ define <9 x i16> @PR172010(<4 x i64> %a0) {
 define <8 x float> @PR173030(i8 %a0, i16 %a1, i32 %a2) {
 ; X86-LABEL: PR173030:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vmovd %ecx, %xmm0
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
-; X86-NEXT:    incb %cl
-; X86-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
+; X86-NEXT:    vmovd %eax, %xmm0
+; X86-NEXT:    # kill: def $al killed $al killed $eax def $eax
+; X86-NEXT:    incb %al
+; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; X86-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovd %ecx, %xmm1
-; X86-NEXT:    incl %ecx
-; X86-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
-; X86-NEXT:    vpmovsxwd %xmm1, %xmm1
-; X86-NEXT:    vmovd %eax, %xmm2
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovd %eax, %xmm1
 ; X86-NEXT:    incl %eax
-; X86-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-NEXT:    vcvtdq2ps %xmm2, %xmm2
+; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; X86-NEXT:    vpmovsxwd %xmm1, %xmm1
 ; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; X86-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovd %eax, %xmm1
+; X86-NEXT:    incl %eax
+; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X86-NEXT:    vcvtdq2ps %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; X86-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
 ; X86-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
@@ -1326,27 +1411,27 @@ define <4 x i16> @PR176951(<32 x i64> %shuffle, <16 x i16> %0, <16 x i1> %cmp) n
 ; X86-AVX2-NEXT:    movl %esp, %ebp
 ; X86-AVX2-NEXT:    andl $-32, %esp
 ; X86-AVX2-NEXT:    subl $32, %esp
-; X86-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; X86-AVX2-NEXT:    vmovaps 104(%ebp), %ymm4
-; X86-AVX2-NEXT:    vmovaps 40(%ebp), %ymm5
+; X86-AVX2-NEXT:    vmovaps 104(%ebp), %ymm3
+; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2],mem[0,2],ymm3[4,6],mem[4,6]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; X86-AVX2-NEXT:    vmovaps %ymm3, 96
+; X86-AVX2-NEXT:    vmovaps 40(%ebp), %ymm3
+; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2],mem[0,2],ymm3[4,6],mem[4,6]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; X86-AVX2-NEXT:    vmovaps %ymm3, 64
+; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],mem[0,2],ymm2[4,6],mem[4,6]
+; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; X86-AVX2-NEXT:    vmovaps %ymm2, 32
 ; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],mem[0,2],ymm2[4,6],mem[4,6]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm5[0,2],mem[0,2],ymm5[4,6],mem[4,6]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; X86-AVX2-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2],mem[0,2],ymm4[4,6],mem[4,6]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; X86-AVX2-NEXT:    vmovaps %ymm4, 96
-; X86-AVX2-NEXT:    vmovaps %ymm2, 64
-; X86-AVX2-NEXT:    vmovaps %ymm1, 32
 ; X86-AVX2-NEXT:    vmovaps %ymm0, 0
-; X86-AVX2-NEXT:    vpsllw $15, %xmm3, %xmm0
+; X86-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X86-AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,8,9,8,9,6,7,u,u,10,11,14,15]
 ; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5],xmm0[6,7]
-; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86-AVX2-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
 ; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1361,22 +1446,22 @@ define <4 x i16> @PR176951(<32 x i64> %shuffle, <16 x i16> %0, <16 x i1> %cmp) n
 ; X86-AVX512-NEXT:    movl %esp, %ebp
 ; X86-AVX512-NEXT:    andl $-64, %esp
 ; X86-AVX512-NEXT:    subl $64, %esp
-; X86-AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; X86-AVX512-NEXT:    vmovdqa64 8(%ebp), %zmm4
+; X86-AVX512-NEXT:    vpmovqd %zmm2, %ymm2
+; X86-AVX512-NEXT:    vmovdqa64 8(%ebp), %zmm3
+; X86-AVX512-NEXT:    vpmovqd %zmm3, %ymm3
+; X86-AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; X86-AVX512-NEXT:    vmovdqa64 %zmm2, 64
 ; X86-AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; X86-AVX512-NEXT:    vpmovqd %zmm1, %ymm1
 ; X86-AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X86-AVX512-NEXT:    vpmovqd %zmm2, %ymm1
-; X86-AVX512-NEXT:    vpmovqd %zmm4, %ymm2
-; X86-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; X86-AVX512-NEXT:    vmovdqa64 %zmm1, 64
 ; X86-AVX512-NEXT:    vmovdqa64 %zmm0, 0
-; X86-AVX512-NEXT:    vpsllw $15, %xmm3, %xmm0
+; X86-AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X86-AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,u,u,8,9,8,9,6,7,u,u,10,11,14,15]
 ; X86-AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5],xmm0[6,7]
-; X86-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; X86-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-AVX512-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; X86-AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
 ; X86-AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 376fe37ef1fa8..4530645963b77 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -52,10 +52,10 @@ define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
 define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
 ; X86-LABEL: combine_pshufb_identity_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; X86-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X86-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
 ; X86-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
 ; X86-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
 ; X86-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
@@ -100,8 +100,8 @@ define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
 define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
 ; X86-LABEL: combine_pshufb_as_pslldq_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
@@ -126,8 +126,8 @@ define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
 define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
 ; X86-LABEL: combine_pshufb_as_psrldq_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
@@ -156,9 +156,9 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1
 define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) {
 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,0,5,0,0,12,0,14]
 ; X86-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63]
 ; X86-NEXT:    retl
 ;
@@ -194,12 +194,19 @@ define <32 x i16> @combine_permvar_as_pshufhw(<32 x i16> %a0) {
 }
 
 define <32 x i16> @combine_vpermi2var_as_packssdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; CHECK-LABEL: combine_vpermi2var_as_packssdw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrad $25, %zmm0, %zmm0
-; CHECK-NEXT:    vpsrad $25, %zmm1, %zmm1
-; CHECK-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_vpermi2var_as_packssdw:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrad $25, %zmm1, %zmm1
+; X86-NEXT:    vpsrad $25, %zmm0, %zmm0
+; X86-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_as_packssdw:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrad $25, %zmm0, %zmm0
+; X64-NEXT:    vpsrad $25, %zmm1, %zmm1
+; X64-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; X64-NEXT:    retq
   %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
   %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
   %3 = bitcast <16 x i32> %1 to <32 x i16>
@@ -209,12 +216,19 @@ define <32 x i16> @combine_vpermi2var_as_packssdw(<16 x i32> %a0, <16 x i32> %a1
 }
 
 define <32 x i16> @combine_vpermi2var_as_packusdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; CHECK-LABEL: combine_vpermi2var_as_packusdw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrld $25, %zmm0, %zmm0
-; CHECK-NEXT:    vpsrld $25, %zmm1, %zmm1
-; CHECK-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_vpermi2var_as_packusdw:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrld $25, %zmm1, %zmm1
+; X86-NEXT:    vpsrld $25, %zmm0, %zmm0
+; X86-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_vpermi2var_as_packusdw:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrld $25, %zmm0, %zmm0
+; X64-NEXT:    vpsrld $25, %zmm1, %zmm1
+; X64-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; X64-NEXT:    retq
   %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
   %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
   %3 = bitcast <16 x i32> %1 to <32 x i16>
@@ -224,12 +238,19 @@ define <32 x i16> @combine_vpermi2var_as_packusdw(<16 x i32> %a0, <16 x i32> %a1
 }
 
 define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
-; CHECK-LABEL: combine_pshufb_as_packsswb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsraw $11, %zmm0, %zmm0
-; CHECK-NEXT:    vpsraw $11, %zmm1, %zmm1
-; CHECK-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_pshufb_as_packsswb:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsraw $11, %zmm1, %zmm1
+; X86-NEXT:    vpsraw $11, %zmm0, %zmm0
+; X86-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_packsswb:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsraw $11, %zmm0, %zmm0
+; X64-NEXT:    vpsraw $11, %zmm1, %zmm1
+; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; X64-NEXT:    retq
   %1 = ashr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = ashr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <32 x i16> %1 to <64 x i8>
@@ -241,12 +262,19 @@ define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nou
 }
 
 define <64 x i8> @combine_pshufb_as_packuswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
-; CHECK-LABEL: combine_pshufb_as_packuswb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; CHECK-NEXT:    vpsrlw $11, %zmm1, %zmm1
-; CHECK-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: combine_pshufb_as_packuswb:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlw $11, %zmm1, %zmm1
+; X86-NEXT:    vpsrlw $11, %zmm0, %zmm0
+; X86-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_packuswb:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlw $11, %zmm0, %zmm0
+; X64-NEXT:    vpsrlw $11, %zmm1, %zmm1
+; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; X64-NEXT:    retq
   %1 = lshr <32 x i16> %a0, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %2 = lshr <32 x i16> %a1, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %3 = bitcast <32 x i16> %1 to <64 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index d33507e1a5bb9..8c2e8533d9a08 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -169,13 +169,21 @@ define <8 x i32> @concat_vrotli_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
 }
 
 define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a2) {
-; CHECK-LABEL: concat_vrotlv_v4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; CHECK-NEXT:    vprolvd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vprolvd %xmm3, %xmm1, %xmm1
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: concat_vrotlv_v4i32:
+; X86:       # %bb.0:
+; X86-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; X86-NEXT:    vprolvd %xmm3, %xmm1, %xmm1
+; X86-NEXT:    vprolvd %xmm2, %xmm0, %xmm0
+; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: concat_vrotlv_v4i32:
+; X64:       # %bb.0:
+; X64-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; X64-NEXT:    vprolvd %xmm2, %xmm0, %xmm0
+; X64-NEXT:    vprolvd %xmm3, %xmm1, %xmm1
+; X64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    retq
   %lo = shufflevector <8 x i32> %a2, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %hi = shufflevector <8 x i32> %a2, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r0 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> %a0, <4 x i32> %lo)
@@ -199,15 +207,15 @@ define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) {
 define void @PR46178(ptr %0) {
 ; X86-LABEL: PR46178:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu 0, %ymm0
-; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0
+; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm1, %xmm1
 ; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; X86-NEXT:    vpsllw $8, %ymm0, %ymm0
 ; X86-NEXT:    vpsraw $8, %ymm0, %ymm0
 ; X86-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index 2cc65b111ace2..c7c45c4770854 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -36,9 +36,9 @@ define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double
 define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
 ; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X86-AVX512F-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
@@ -47,9 +47,9 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
 ;
 ; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
 ; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
 ; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X86-AVX512BW-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
@@ -95,9 +95,9 @@ define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
 define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
 ; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X86-AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
@@ -106,9 +106,9 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
 ;
 ; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
 ; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
 ; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X86-AVX512BW-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
@@ -154,18 +154,18 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou
 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    retl
@@ -254,18 +254,18 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
 ; X86-AVX512BW-NEXT:    vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    retl
@@ -1097,17 +1097,17 @@ define <16 x i32> @combine_vcompressd_splat(i16 %m) {
 define <8 x i64> @PR179008(ptr %p0) {
 ; X86-AVX512F-LABEL: PR179008:
 ; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    movb $31, %al
+; X86-AVX512F-NEXT:    kmovw %eax, %k1
 ; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512F-NEXT:    movb $31, %cl
-; X86-AVX512F-NEXT:    kmovw %ecx, %k1
 ; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0 {%k1} {z}
 ; X86-AVX512F-NEXT:    retl
 ;
 ; X86-AVX512BW-LABEL: PR179008:
 ; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    movb $31, %al
+; X86-AVX512BW-NEXT:    kmovd %eax, %k1
 ; X86-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512BW-NEXT:    movb $31, %cl
-; X86-AVX512BW-NEXT:    kmovd %ecx, %k1
 ; X86-AVX512BW-NEXT:    vmovdqu64 (%eax), %zmm0 {%k1} {z}
 ; X86-AVX512BW-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 7d6ca16313583..68ab1116d85b7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -126,8 +126,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d(<8 x i64> %a0, <8 x i64> %a1
 define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) {
 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [56,57,58,59,56,57,58,59,56,57,58,59,56,57,58,59,44,45,46,47,44,45,46,47,44,45,46,47,44,45,46,47,96,97,98,99,96,97,98,99,96,97,98,99,96,97,98,99,116,117,118,119,116,117,118,119,116,117,118,119,116,117,118,119]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpermi2b %zmm0, %zmm1, %zmm2 {%k1} {z}
 ; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; X86-NEXT:    retl
@@ -174,8 +174,8 @@ define <64 x i8> @combine_vpermi2var_v64i8_with_mask_commute(<64 x i8> %a0, <64
 define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask(<64 x i8> %a0) {
 ; X86-LABEL: combine_vpermi2var_constant_v64i8_with_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
 ; X86-NEXT:    vpmovb2m %zmm0, %k1
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
 ; X86-NEXT:    vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
@@ -193,9 +193,9 @@ define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask(<64 x i8> %a0) {
 define <64 x i8> @combine_vpermi2var_constant_v64i8_with_mask_commute(<64 x i8> %a0) {
 ; X86-LABEL: combine_vpermi2var_constant_v64i8_with_mask_commute:
 ; X86:       # %bb.0:
-; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
 ; X86-NEXT:    vpmovb2m %zmm0, %k0
 ; X86-NEXT:    knotq %k0, %k1
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
 ; X86-NEXT:    vpermi2b {{\.?LCPI[0-9]+_[0-9]+}}, %zmm1, %zmm0 {%k1} {z}
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
index d5aa7588925d8..b1d2da4d8b8fb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
@@ -49,36 +49,34 @@ define <4 x double> @concat_vpermv3_ops_vpermv_swap_v4f64(ptr %p0, <4 x i64> %m)
 define void @PR142995(ptr %p0, ptr %p1, ptr %p2) nounwind #0 {
 ; X86-LABEL: PR142995:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movw $1031, %ax # imm = 0x407
+; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb $17, %bl
-; X86-NEXT:    kmovw %ebx, %k1
-; X86-NEXT:    vmovdqu32 (%edx), %ymm0 {%k1} {z}
+; X86-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} {z}
 ; X86-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    movw $6144, %dx # imm = 0x1800
-; X86-NEXT:    kmovw %edx, %k1
-; X86-NEXT:    vmovdqu32 128(%ecx), %zmm2 {%k1} {z}
-; X86-NEXT:    movw $1031, %dx # imm = 0x407
-; X86-NEXT:    kmovw %edx, %k1
-; X86-NEXT:    vmovdqu32 (%ecx), %zmm3 {%k1} {z}
-; X86-NEXT:    vpbroadcastd 252(%ecx), %zmm4
-; X86-NEXT:    vpbroadcastd %xmm1, %xmm5
-; X86-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero
-; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT:    vpunpckldq {{.*#+}} zmm2 = zmm4[0],zmm2[0],zmm4[1],zmm2[1],zmm4[4],zmm2[4],zmm4[5],zmm2[5],zmm4[8],zmm2[8],zmm4[9],zmm2[9],zmm4[12],zmm2[12],zmm4[13],zmm2[13]
-; X86-NEXT:    vextracti32x4 $3, %zmm2, %xmm2
-; X86-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; X86-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movw $6144, %cx # imm = 0x1800
+; X86-NEXT:    kmovw %ecx, %k1
+; X86-NEXT:    vmovdqu32 128(%eax), %zmm3 {%k1} {z}
+; X86-NEXT:    vpbroadcastd 252(%eax), %zmm0
+; X86-NEXT:    vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm3[0],zmm0[1],zmm3[1],zmm0[4],zmm3[4],zmm0[5],zmm3[5],zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[12],zmm3[12],zmm0[13],zmm3[13]
+; X86-NEXT:    vextracti32x4 $3, %zmm3, %xmm3
+; X86-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; X86-NEXT:    vpbroadcastd %xmm1, %xmm1
+; X86-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero
+; X86-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; X86-NEXT:    movb $17, %al
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vmovdqu32 (%eax), %ymm2 {%k1} {z}
 ; X86-NEXT:    vmovdqu %xmm1, (%eax)
-; X86-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,0,10,0,4,4,14,0]
-; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1
-; X86-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7]
-; X86-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7]
+; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [4,0,10,0,4,4,14,0]
+; X86-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
+; X86-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6,7]
+; X86-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovdqu %ymm0, (%eax)
-; X86-NEXT:    popl %ebx
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;
@@ -142,11 +140,11 @@ define <8 x double> @PR143606(ptr %px, ptr %py) {
 ; X86-LABEL: PR143606:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovapd (%ecx), %ymm0
-; X86-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
-; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-NEXT:    vmovapd (%eax), %ymm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vshufpd {{.*#+}} ymm1 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X86-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3]
+; X86-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: PR143606:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index ee4a7e1413f6c..d17a3222e670b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -298,8 +298,8 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define void @buildvector_v4f32_0404(float %a, float %b, ptr %ptr) {
 ; X86-LABEL: buildvector_v4f32_0404:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -326,9 +326,9 @@ define void @buildvector_v4f32_0404(float %a, float %b, ptr %ptr) {
 define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, ptr %ptr) {
 ; X86-LABEL: buildvector_v4f32_07z6:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
index 60800673ed2dd..6049912ccd479 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -32,10 +32,10 @@ define void @test1() nounwind {
 ; X86-LABEL: test1:
 ; X86:       ## %bb.0: ## %entry
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    pxor %mm0, %mm0
-; X86-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm1 ## mm1 = 0x7070606040400000
+; X86-NEXT:    movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x7070606040400000
+; X86-NEXT:    pxor %mm1, %mm1
 ; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    maskmovq %mm1, %mm0
+; X86-NEXT:    maskmovq %mm0, %mm1
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
@@ -92,12 +92,12 @@ define <4 x float> @pr35869() nounwind {
 ; X86-NEXT:    punpcklbw %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
 ; X86-NEXT:    pcmpgtw %mm0, %mm1
 ; X86-NEXT:    movq %mm0, %mm2
-; X86-NEXT:    punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3]
+; X86-NEXT:    punpcklwd %mm1, %mm2 ## mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT:    punpckhwd %mm1, %mm0 ## mm0 = mm0[2],mm1[2],mm0[3],mm1[3]
 ; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    cvtpi2ps %mm2, %xmm0
-; X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
-; X86-NEXT:    punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
 ; X86-NEXT:    cvtpi2ps %mm0, %xmm0
+; X86-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X86-NEXT:    cvtpi2ps %mm2, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pr35869:
diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
index 9f2f1d57c2dbc..f7b3b0bcf6637 100644
--- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
+++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
@@ -9,56 +9,34 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-64, %esp
 ; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movl 456(%ebp), %esi
-; X86-NEXT:    vmovdqa64 328(%ebp), %zmm3
-; X86-NEXT:    vmovdqa64 200(%ebp), %zmm4
-; X86-NEXT:    vmovdqa64 72(%ebp), %zmm5
 ; X86-NEXT:    vp2intersectd %zmm1, %zmm0, %k0
-; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; X86-NEXT:    kmovw %k0, %ecx
+; X86-NEXT:    kmovw %k1, %eax
+; X86-NEXT:    vmovdqa64 328(%ebp), %zmm0
+; X86-NEXT:    vp2intersectd 392(%ebp), %zmm0, %k0
+; X86-NEXT:    kmovw %k0, %edx
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    vp2intersectd 8(%ebp), %zmm2, %k0
-; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    vp2intersectd 136(%ebp), %zmm5, %k0
-; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    vp2intersectd 264(%ebp), %zmm4, %k0
-; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    vp2intersectd 392(%ebp), %zmm3, %k0
-; X86-NEXT:    kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
-; X86-NEXT:    vzeroupper
-; X86-NEXT:    calll dummy@PLT
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
-; X86-NEXT:    kmovw %k0, %edx
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
-; X86-NEXT:    kmovw %k0, %ecx
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
-; X86-NEXT:    kmovw %k0, %edi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload
-; X86-NEXT:    kmovw %k2, %edi
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    kmovw %k1, %edx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    leal -8(%ebp), %esp
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    vmovdqa64 200(%ebp), %zmm0
+; X86-NEXT:    vp2intersectd 264(%ebp), %zmm0, %k0
+; X86-NEXT:    kmovw %k0, %ecx
+; X86-NEXT:    vmovdqa64 72(%ebp), %zmm0
+; X86-NEXT:    vp2intersectd 136(%ebp), %zmm0, %k0
+; X86-NEXT:    kmovw %k0, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    calll dummy@PLT
+; X86-NEXT:    movl 456(%ebp), %eax
+; X86-NEXT:    movw %si, (%eax)
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-1.ll b/llvm/test/CodeGen/X86/vshift-1.ll
index d481690d9f2b8..b65b2ad65771f 100644
--- a/llvm/test/CodeGen/X86/vshift-1.ll
+++ b/llvm/test/CodeGen/X86/vshift-1.ll
@@ -8,8 +8,8 @@
 define void @shift1a(<2 x i64> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift1a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psllq $32, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -27,9 +27,9 @@ entry:
 define void @shift1b(<2 x i64> %val, ptr %dst, i64 %amt) nounwind {
 ; X86-LABEL: shift1b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psllq %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -51,8 +51,8 @@ entry:
 define void @shift2a(<4 x i32> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift2a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pslld $5, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -70,9 +70,9 @@ entry:
 define void @shift2b(<4 x i32> %val, ptr %dst, i32 %amt) nounwind {
 ; X86-LABEL: shift2b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -95,8 +95,8 @@ entry:
 define void @shift3a(<8 x i16> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift3a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psllw $5, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -115,10 +115,10 @@ entry:
 define void @shift3b(<8 x i16> %val, ptr %dst, i16 %amt) nounwind {
 ; X86-LABEL: shift3b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm1
 ; X86-NEXT:    psllw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-2.ll b/llvm/test/CodeGen/X86/vshift-2.ll
index d4f8b02cd0c52..ce0ca9d4a84ef 100644
--- a/llvm/test/CodeGen/X86/vshift-2.ll
+++ b/llvm/test/CodeGen/X86/vshift-2.ll
@@ -8,8 +8,8 @@
 define void @shift1a(<2 x i64> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift1a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrlq $32, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -27,9 +27,9 @@ entry:
 define void @shift1b(<2 x i64> %val, ptr %dst, i64 %amt) nounwind {
 ; X86-LABEL: shift1b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    psrlq %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -50,8 +50,8 @@ entry:
 define void @shift2a(<4 x i32> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift2a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrld $17, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -69,9 +69,9 @@ entry:
 define void @shift2b(<4 x i32> %val, ptr %dst, i32 %amt) nounwind {
 ; X86-LABEL: shift2b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    psrld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -95,8 +95,8 @@ entry:
 define void @shift3a(<8 x i16> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift3a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrlw $5, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -115,10 +115,10 @@ entry:
 define void @shift3b(<8 x i16> %val, ptr %dst, i16 %amt) nounwind {
 ; X86-LABEL: shift3b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm1
 ; X86-NEXT:    psrlw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-3.ll b/llvm/test/CodeGen/X86/vshift-3.ll
index 8d472f00b488e..2b86a54be1220 100644
--- a/llvm/test/CodeGen/X86/vshift-3.ll
+++ b/llvm/test/CodeGen/X86/vshift-3.ll
@@ -10,11 +10,11 @@
 define void @shift1a(<2 x i64> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift1a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrad $31, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -35,8 +35,8 @@ entry:
 define void @shift2a(<4 x i32> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift2a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrad $5, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -54,9 +54,9 @@ entry:
 define void @shift2b(<4 x i32> %val, ptr %dst, i32 %amt) nounwind {
 ; X86-LABEL: shift2b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    psrad %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -79,8 +79,8 @@ entry:
 define void @shift3a(<8 x i16> %val, ptr %dst) nounwind {
 ; X86-LABEL: shift3a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psraw $5, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -98,10 +98,10 @@ entry:
 define void @shift3b(<8 x i16> %val, ptr %dst, i16 %amt) nounwind {
 ; X86-LABEL: shift3b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm1
 ; X86-NEXT:    psraw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-4.ll b/llvm/test/CodeGen/X86/vshift-4.ll
index 9af761c5dc15a..9d0b6775a6bd9 100644
--- a/llvm/test/CodeGen/X86/vshift-4.ll
+++ b/llvm/test/CodeGen/X86/vshift-4.ll
@@ -8,8 +8,8 @@
 define void @shift1a(<2 x i64> %val, ptr %dst, <2 x i64> %sh) nounwind {
 ; X86-LABEL: shift1a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psllq %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -29,12 +29,12 @@ entry:
 define void @shift1b(<2 x i64> %val, ptr %dst, <2 x i64> %sh) nounwind {
 ; X86-LABEL: shift1b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, %xmm2
 ; X86-NEXT:    psllq %xmm1, %xmm2
 ; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; X86-NEXT:    psllq %xmm1, %xmm0
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movapd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -57,9 +57,9 @@ entry:
 define void @shift2a(<4 x i32> %val, ptr %dst, <2 x i32> %amt) nounwind {
 ; X86-LABEL: shift2a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrlq $32, %xmm1
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -79,9 +79,9 @@ entry:
 define void @shift2b(<4 x i32> %val, ptr %dst, <2 x i32> %amt) nounwind {
 ; X86-LABEL: shift2b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrlq $32, %xmm1
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -101,9 +101,9 @@ entry:
 define void @shift2c(<4 x i32> %val, ptr %dst, <2 x i32> %amt) nounwind {
 ; X86-LABEL: shift2c:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    psrlq $32, %xmm1
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -123,10 +123,10 @@ entry:
 define void @shift3a(<8 x i16> %val, ptr %dst, <8 x i16> %amt) nounwind {
 ; X86-LABEL: shift3a:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
 ; X86-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X86-NEXT:    psllw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -147,10 +147,10 @@ entry:
 define void @shift3b(<8 x i16> %val, ptr %dst, i16 %amt) nounwind {
 ; X86-LABEL: shift3b:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movd %ecx, %xmm1
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm1
 ; X86-NEXT:    psllw %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-5.ll b/llvm/test/CodeGen/X86/vshift-5.ll
index e3f695453b5b9..00b1e3e8d3e4e 100644
--- a/llvm/test/CodeGen/X86/vshift-5.ll
+++ b/llvm/test/CodeGen/X86/vshift-5.ll
@@ -8,9 +8,9 @@ define void @shift5a(<4 x i32> %val, ptr %dst, ptr %pamt) nounwind {
 ; X86-LABEL: shift5a:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -34,9 +34,9 @@ define void @shift5b(<4 x i32> %val, ptr %dst, ptr %pamt) nounwind {
 ; X86-LABEL: shift5b:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    psrad %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -59,9 +59,9 @@ entry:
 define void @shift5c(<4 x i32> %val, ptr %dst, i32 %amt) nounwind {
 ; X86-LABEL: shift5c:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pslld %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -83,9 +83,9 @@ entry:
 define void @shift5d(<4 x i32> %val, ptr %dst, i32 %amt) nounwind {
 ; X86-LABEL: shift5d:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    psrad %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vshift-6.ll b/llvm/test/CodeGen/X86/vshift-6.ll
index 912ff750d9e91..7070d8e68610e 100644
--- a/llvm/test/CodeGen/X86/vshift-6.ll
+++ b/llvm/test/CodeGen/X86/vshift-6.ll
@@ -32,12 +32,12 @@ define <16 x i8> @do_not_crash(ptr, ptr, ptr, i32, i64, i8) {
 ; X86-NEXT:    movb %al, (%ecx)
 ; X86-NEXT:    movd %eax, %xmm1
 ; X86-NEXT:    psllq $56, %xmm1
-; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    psllw $5, %xmm1
 ; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
 ; X86-NEXT:    pxor %xmm0, %xmm0
 ; X86-NEXT:    pcmpgtb %xmm1, %xmm0
+; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    pxor %xmm0, %xmm3
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-NEXT:    por %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/vshift_split2.ll b/llvm/test/CodeGen/X86/vshift_split2.ll
index 8ac97afb051f4..e79b36f14abcd 100644
--- a/llvm/test/CodeGen/X86/vshift_split2.ll
+++ b/llvm/test/CodeGen/X86/vshift_split2.ll
@@ -6,10 +6,10 @@
 define void @update(<8 x i32> %val, ptr %dst) nounwind {
 ; CHECK-LABEL: update:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    psrad $2, %xmm0
 ; CHECK-NEXT:    psrad $4, %xmm1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movdqa %xmm1, 16(%eax)
+; CHECK-NEXT:    psrad $2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, (%eax)
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e9ddc576c6cd8..f3c58b5ee325a 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -53,24 +53,24 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: lshr_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-LABEL: lshr_4bytes:
 ; X86-HAVE-BMI2:       # %bb.0:
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
-; X86-HAVE-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
@@ -100,24 +100,24 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: shl_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-NO-BMI2-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-LABEL: shl_4bytes:
 ; X86-HAVE-BMI2:       # %bb.0:
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NEXT:    shlxl %eax, (%ecx), %eax
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
-; X86-HAVE-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
-; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
@@ -147,24 +147,24 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: ashr_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-NO-BMI2-NEXT:    sarl %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    sarl %cl, %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-LABEL: ashr_4bytes:
 ; X86-HAVE-BMI2:       # %bb.0:
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NEXT:    sarxl %eax, (%ecx), %eax
 ; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
-; X86-HAVE-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
-; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-HAVE-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %byteOff = load i32, ptr %byteOff.ptr, align 1
@@ -194,109 +194,100 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-SHLD-NO-BMI2-LABEL: lshr_8bytes:
 ; X86-NO-SHLD-NO-BMI2:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%eax), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%edx,%edx), %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %edi, %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-NO-BMI2-LABEL: lshr_8bytes:
 ; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrl %cl, %edx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%ecx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%ecx)
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
 ; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ecx,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    xorl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%eax)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
 ; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%ecx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -325,110 +316,102 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-SHLD-NO-BMI2-LABEL: shl_8bytes:
 ; X86-NO-SHLD-NO-BMI2:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%eax), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %edi, %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, (%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-NO-BMI2-LABEL: shl_8bytes:
 ; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    xorl %edx, %edx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%ecx)
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
 ; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, 4(%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, 4(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %cl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    xorl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %al
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%eax)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
 ; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -457,110 +440,102 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-SHLD-NO-BMI2-LABEL: ashr_8bytes:
 ; X86-NO-SHLD-NO-BMI2:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%eax), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%edx,%edx), %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %edi, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%eax)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_8bytes:
 ; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %edx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
 ; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %dl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %edx, (%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ecx,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %eax, %ecx, %edx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %dl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%eax)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
 ;
 ; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
 ; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%ecx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -652,58 +627,60 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $60, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb (%eax), %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %al
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %ah, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%ebx), %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, (%esi)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%edx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -712,47 +689,45 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes:
 ; X86-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb (%ecx), %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $32, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %ch, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%ebx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%esi), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebp, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $32, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes:
@@ -762,46 +737,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $44, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %bl, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %dl, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 12(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esp,%edi), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%edi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, (%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, (%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
@@ -811,48 +786,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes:
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $32, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %al
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %al, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esp,%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %eax, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 8(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 12(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%ebx)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $32, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    retl
 ;
 ; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes:
@@ -862,53 +835,57 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%ecx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 8(%esi)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, (%esi)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
@@ -923,32 +900,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %dl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%esp,%ebx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp,%ebx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%esp,%esi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%esp,%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%ebp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
@@ -963,41 +939,44 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %al
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %dl, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, 16(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %dl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, 16(%esp,%esi), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%edi)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
@@ -1012,33 +991,32 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %dl, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%esp,%esi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%esp,%esi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 12(%ebp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%ebp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
@@ -1053,53 +1031,57 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%edi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%esi,%esi), %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 4(%ecx)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 8(%esi)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, (%esi)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
@@ -1114,32 +1096,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %dl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 12(%esp,%ebx), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 8(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp,%ebx), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 4(%esp,%esi), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 12(%esp,%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%ebp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
@@ -1154,41 +1135,44 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %dl, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, 16(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %dl, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%edi), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%edi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %ebx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebp, %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebp, 16(%esp,%esi), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%esi)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%edi)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
@@ -1203,33 +1187,32 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %dl, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 12(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 8(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 4(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 8(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 4(%esp,%esi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 12(%esp,%esi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 12(%ebp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%ebp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
@@ -1322,52 +1305,46 @@ define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: lshr_16bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    subl $44, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    andl $3, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %ecx, 8(%edx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, 12(%edx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SSE2-NEXT:    movl %ecx, (%edx)
+; X86-SSE2-NEXT:    movl %eax, 4(%edx)
+; X86-SSE2-NEXT:    addl $44, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: lshr_16bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $44, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
 ; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $3, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $3, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,4), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $44, %esp
 ; X86-SSE42-NEXT:    retl
@@ -1375,16 +1352,16 @@ define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X86-AVX-LABEL: lshr_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $44, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
-; X86-AVX-NEXT:    andl $3, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $3, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,4), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $44, %esp
 ; X86-AVX-NEXT:    retl
@@ -1480,59 +1457,64 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $60, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb (%eax), %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %dh
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %dh
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %ah, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%ebp), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 4(%edi)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
@@ -1542,45 +1524,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes:
 ; X86-HAVE-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $32, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb (%ecx), %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %ch, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %al, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edi), %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 12(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes:
@@ -1589,49 +1575,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $44, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $60, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebx, %edx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, (%ecx)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %ebx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, 28(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebx, 44(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 12(%esi)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %edx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $60, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -1645,37 +1635,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $12, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %al
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %al, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edi, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%esi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%esi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%ebp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
@@ -1689,56 +1681,57 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $44, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    negb %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%edi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 8(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 12(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -1752,32 +1745,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negb %dl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %dl, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negb %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
@@ -1792,42 +1786,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %al
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, (%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %dl
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %dl, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, 28(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %esi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %dl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, 28(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
@@ -1842,32 +1835,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %dl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %dl, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %ebx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
@@ -1881,56 +1875,57 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $44, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    negb %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movsbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 4(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%edi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%edi), %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 8(%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 12(%ebx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -1944,32 +1939,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    negb %dl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbl %dl, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    negb %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%esi), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
@@ -1984,42 +1980,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, (%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %dl
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    negb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %dl, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, 28(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %esi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %dl, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, 28(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
@@ -2034,32 +2029,33 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %dl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    negb %dl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %dl, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%edi), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %ebx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $44, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
@@ -2153,58 +2149,52 @@ define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ;
 ; X86-SSE2-LABEL: shl_16bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
+; X86-SSE2-NEXT:    subl $44, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    shlb $2, %cl
-; X86-SSE2-NEXT:    andb $12, %cl
-; X86-SSE2-NEXT:    negb %cl
-; X86-SSE2-NEXT:    movsbl %cl, %ecx
-; X86-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
-; X86-SSE2-NEXT:    movl 20(%esp,%ecx), %esi
-; X86-SSE2-NEXT:    movl 28(%esp,%ecx), %edi
-; X86-SSE2-NEXT:    movl 24(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    shlb $2, %al
+; X86-SSE2-NEXT:    andb $12, %al
+; X86-SSE2-NEXT:    negb %al
+; X86-SSE2-NEXT:    movsbl %al, %eax
+; X86-SSE2-NEXT:    movl 24(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %ecx, 8(%edx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, 12(%edx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax), %ecx
+; X86-SSE2-NEXT:    movl 20(%esp,%eax), %eax
+; X86-SSE2-NEXT:    movl %ecx, (%edx)
+; X86-SSE2-NEXT:    movl %eax, 4(%edx)
+; X86-SSE2-NEXT:    addl $44, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: shl_16bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $44, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
-; X86-SSE42-NEXT:    movaps %xmm1, (%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
 ; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    shlb $2, %cl
-; X86-SSE42-NEXT:    andb $12, %cl
-; X86-SSE42-NEXT:    negb %cl
-; X86-SSE42-NEXT:    movsbl %cl, %ecx
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    shlb $2, %al
+; X86-SSE42-NEXT:    andb $12, %al
+; X86-SSE42-NEXT:    negb %al
+; X86-SSE42-NEXT:    movsbl %al, %eax
+; X86-SSE42-NEXT:    movups 16(%esp,%eax), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $44, %esp
 ; X86-SSE42-NEXT:    retl
@@ -2212,19 +2202,19 @@ define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X86-AVX-LABEL: shl_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $44, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovaps %xmm1, (%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %xmm0
 ; X86-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    shlb $2, %cl
-; X86-AVX-NEXT:    andb $12, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    shlb $2, %al
+; X86-AVX-NEXT:    andb $12, %al
+; X86-AVX-NEXT:    negb %al
+; X86-AVX-NEXT:    movsbl %al, %eax
+; X86-AVX-NEXT:    vmovups 16(%esp,%eax), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $44, %esp
 ; X86-AVX-NEXT:    retl
@@ -2319,61 +2309,63 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    subl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movb (%eax), %ah
-; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ah, %al
-; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%eax), %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%eax), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, (%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %ecx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %ah
-; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %ah, %ebp
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%ebx), %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, (%esi)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, (%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
@@ -2382,50 +2374,48 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
 ; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    subl $32, %esp
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb (%ecx), %ch
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%eax), %edx
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    andb $12, %ch
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl %ch, %ebx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esp,%ebx), %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    andb $12, %al
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%esi), %esi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebp, %edx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-NEXT:    addl $32, %esp
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
 ;
 ; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
@@ -2436,48 +2426,48 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %bl, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %dl, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %ebx, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 12(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%edi), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%esi,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%edi), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, (%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 4(%edx)
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
@@ -2487,51 +2477,49 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
 ; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    subl $32, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    andb $12, %al
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl %al, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %eax, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 8(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %esi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 12(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%ebx)
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%ebx)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    addl $32, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -2620,104 +2608,86 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: ashr_16bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $32, %esp
+; X86-SSE2-NEXT:    subl $44, %esp
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl (%edx), %esi
-; X86-SSE2-NEXT:    movl 4(%edx), %edi
-; X86-SSE2-NEXT:    movl 8(%edx), %ebx
-; X86-SSE2-NEXT:    movl 12(%edx), %edx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, (%esp)
-; X86-SSE2-NEXT:    sarl $31, %edx
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $3, %ecx
-; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
-; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
-; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
-; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl %edi, 12(%eax)
-; X86-SSE2-NEXT:    movl %edx, (%eax)
-; X86-SSE2-NEXT:    movl %esi, 4(%eax)
-; X86-SSE2-NEXT:    addl $32, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
+; X86-SSE2-NEXT:    movl (%eax), %edx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %edx, (%esp)
+; X86-SSE2-NEXT:    sarl $31, %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %ecx, 8(%edx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %ecx, 12(%edx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SSE2-NEXT:    movl %ecx, (%edx)
+; X86-SSE2-NEXT:    movl %eax, 4(%edx)
+; X86-SSE2-NEXT:    addl $44, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: ashr_16bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $32, %esp
+; X86-SSE42-NEXT:    subl $44, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movl (%edx), %esi
-; X86-SSE42-NEXT:    movl 4(%edx), %edi
-; X86-SSE42-NEXT:    movl 8(%edx), %ebx
-; X86-SSE42-NEXT:    movl 12(%edx), %edx
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 12(%eax), %ecx
+; X86-SSE42-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 8(%eax), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $3, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT:    movl (%eax), %edx
+; X86-SSE42-NEXT:    movl 4(%eax), %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %edx, (%esp)
+; X86-SSE42-NEXT:    sarl $31, %ecx
+; X86-SSE42-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $3, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,4), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $32, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
+; X86-SSE42-NEXT:    addl $44, %esp
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: ashr_16bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $32, %esp
+; X86-AVX-NEXT:    subl $44, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl (%edx), %esi
-; X86-AVX-NEXT:    movl 4(%edx), %edi
-; X86-AVX-NEXT:    movl 8(%edx), %ebx
-; X86-AVX-NEXT:    movl 12(%edx), %edx
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
+; X86-AVX-NEXT:    movl 12(%eax), %ecx
+; X86-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 8(%eax), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $3, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT:    movl (%eax), %edx
+; X86-AVX-NEXT:    movl 4(%eax), %eax
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %edx, (%esp)
+; X86-AVX-NEXT:    sarl $31, %ecx
+; X86-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $3, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,4), %xmm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $32, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
+; X86-AVX-NEXT:    addl $44, %esp
 ; X86-AVX-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %dwordOff = load i128, ptr %dwordOff.ptr, align 1
@@ -3172,122 +3142,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ebp), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb (%eax), %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %dh
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %dh
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %ah, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %dl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 28(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 16(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%eax), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 4(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -3300,73 +3255,62 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb (%ecx), %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %ch, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%edx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -3379,95 +3323,93 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ebp,%ebp), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, 32(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %dl, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%ebx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 20(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%ebx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%esi), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %eax, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%ebx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 20(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, 16(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -3480,74 +3422,63 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %bl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %bl, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -3560,102 +3491,100 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%ebp), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%edi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 24(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 16(%ebp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 20(%ebp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -3668,60 +3597,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -3734,82 +3657,81 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %dl, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, 32(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 12(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, 16(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%ebx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -3822,59 +3744,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%ebx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -3887,99 +3805,97 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%ebp), %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%edi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 24(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%edx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 16(%ebp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 20(%ebp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -3993,57 +3909,51 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 24(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -4057,79 +3967,78 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %dl, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, 32(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %cl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 12(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, 16(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%ebx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
@@ -4143,56 +4052,52 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%ebx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 32(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
@@ -4657,83 +4562,67 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
+; X86-SSE2-NEXT:    subl $76, %esp
+; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-SSE2-NEXT:    movl 16(%eax), %ebx
-; X86-SSE2-NEXT:    movl 20(%eax), %ebp
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
 ; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $7, %eax
-; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
-; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
-; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
-; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, (%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: lshr_32bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $7, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $7, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,4), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,4), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $76, %esp
@@ -4742,17 +4631,17 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ; X86-AVX-LABEL: lshr_32bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    andl $7, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $7, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,4), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%eax,4), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $76, %esp
@@ -4826,83 +4715,67 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: lshr_32bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-SSE2-NEXT:    movl 16(%eax), %ebx
-; X86-SSE2-NEXT:    movl 20(%eax), %ebp
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    subl $76, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $3, %eax
-; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
-; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
-; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
-; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, (%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,8), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: lshr_32bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $3, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $3, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,8), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $76, %esp
@@ -4911,17 +4784,17 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-AVX-LABEL: lshr_32bytes_qwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    andl $3, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $3, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,8), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%eax,8), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $76, %esp
@@ -5388,119 +5261,115 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb (%eax), %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %ah
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %ah, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%ebx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%ebp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 24(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 28(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 16(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 84(%esp,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 80(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 8(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 92(%esp,%ebx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 88(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -5513,73 +5382,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb (%ecx), %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %ch
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %ch, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 28(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 16(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 4(%eax)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negb %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movsbl %al, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -5592,105 +5452,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %bl
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%esi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebp, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %cl, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%edi), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 80(%esp,%ebp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, %ebx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%edi), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, 92(%esp,%ecx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 88(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%edi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, 76(%esp,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -5703,75 +5547,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %bl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %bl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %bl, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movsbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 12(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %esi, %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -5784,106 +5617,99 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %cl, %dh
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %dh
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    negb %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 84(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%ebx), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%ebx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%ebx), %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 88(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 92(%esp,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 4(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 20(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -5896,58 +5722,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $76, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negb %al
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%esi), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 28(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 8(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 16(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%edx)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%esi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%esi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%ebx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%esi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -5960,82 +5783,77 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %dl, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 84(%esp,%edx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 80(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, 76(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 28(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 24(%edi)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 16(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 88(%esp,%edx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, 92(%esp,%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -6049,56 +5867,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negb %al
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%esi), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 28(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 4(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 20(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%esi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 12(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 16(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 20(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%edi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
@@ -6112,103 +5926,96 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %cl, %dh
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %dh
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    negb %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    negb %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movsbl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 84(%esp,%ebx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 76(%esp,%ebx), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 72(%esp,%ebx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 72(%esp,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 68(%esp,%ebx), %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 68(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 64(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 88(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 92(%esp,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 4(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 64(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 16(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 20(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -6222,55 +6029,52 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $76, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    negb %al
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 64(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 68(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%esi), %edx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %esi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 76(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 28(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 4(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 8(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 16(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 20(%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%edx)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%esi), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%esi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%esi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 8(%ebx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%esi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%esi), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -6284,79 +6088,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    negb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %dl, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 84(%esp,%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    negb %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, (%ecx)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 80(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 76(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, 76(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 28(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 24(%edi)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 64(%esp,%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 88(%esp,%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, 92(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%esi), %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 64(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 16(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
@@ -6371,53 +6170,49 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    negb %al
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 64(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movsbl %al, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 76(%esp,%esi), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 76(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 28(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 4(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 16(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 20(%edx)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%esi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%esi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%esi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 8(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%esi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 12(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 64(%esp,%esi), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 16(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 20(%edi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%edi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
@@ -6903,89 +6698,73 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ;
 ; X86-SSE2-LABEL: shl_32bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SSE2-NEXT:    movl (%ebp), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%ebp), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%ebp), %esi
-; X86-SSE2-NEXT:    movl 12(%ebp), %edi
-; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl 20(%ebp), %edx
-; X86-SSE2-NEXT:    movl 24(%ebp), %eax
-; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    subl $76, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    shlb $2, %cl
-; X86-SSE2-NEXT:    andb $28, %cl
-; X86-SSE2-NEXT:    negb %cl
-; X86-SSE2-NEXT:    movsbl %cl, %edx
-; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
-; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
-; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
-; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
-; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
-; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %edx, 24(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    shlb $2, %al
+; X86-SSE2-NEXT:    andb $28, %al
+; X86-SSE2-NEXT:    negb %al
+; X86-SSE2-NEXT:    movsbl %al, %eax
+; X86-SSE2-NEXT:    movl 56(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 60(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 48(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 52(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 40(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 44(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl 32(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl 36(%esp,%eax), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: shl_32bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    shlb $2, %cl
-; X86-SSE42-NEXT:    andb $28, %cl
-; X86-SSE42-NEXT:    negb %cl
-; X86-SSE42-NEXT:    movsbl %cl, %ecx
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    shlb $2, %al
+; X86-SSE42-NEXT:    andb $28, %al
+; X86-SSE42-NEXT:    negb %al
+; X86-SSE42-NEXT:    movsbl %al, %eax
+; X86-SSE42-NEXT:    movups 32(%esp,%eax), %xmm0
+; X86-SSE42-NEXT:    movups 48(%esp,%eax), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $76, %esp
@@ -6994,20 +6773,20 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 ; X86-AVX-LABEL: shl_32bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    shlb $2, %cl
-; X86-AVX-NEXT:    andb $28, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    shlb $2, %al
+; X86-AVX-NEXT:    andb $28, %al
+; X86-AVX-NEXT:    negb %al
+; X86-AVX-NEXT:    movsbl %al, %eax
+; X86-AVX-NEXT:    vmovups 32(%esp,%eax), %xmm0
+; X86-AVX-NEXT:    vmovups 48(%esp,%eax), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $76, %esp
@@ -7090,89 +6869,73 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ;
 ; X86-SSE2-LABEL: shl_32bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SSE2-NEXT:    movl (%ebp), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%ebp), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%ebp), %esi
-; X86-SSE2-NEXT:    movl 12(%ebp), %edi
-; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
-; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE2-NEXT:    movl 20(%ebp), %edx
-; X86-SSE2-NEXT:    movl 24(%ebp), %eax
-; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    subl $76, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    shlb $3, %cl
-; X86-SSE2-NEXT:    andb $24, %cl
-; X86-SSE2-NEXT:    negb %cl
-; X86-SSE2-NEXT:    movsbl %cl, %edx
-; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
-; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
-; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
-; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
-; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
-; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %edx, 24(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    shlb $3, %al
+; X86-SSE2-NEXT:    andb $24, %al
+; X86-SSE2-NEXT:    negb %al
+; X86-SSE2-NEXT:    movsbl %al, %eax
+; X86-SSE2-NEXT:    movl 56(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 60(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 48(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 52(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 40(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 44(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl 32(%esp,%eax), %edx
+; X86-SSE2-NEXT:    movl 36(%esp,%eax), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: shl_32bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $76, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    shlb $3, %cl
-; X86-SSE42-NEXT:    andb $24, %cl
-; X86-SSE42-NEXT:    negb %cl
-; X86-SSE42-NEXT:    movsbl %cl, %ecx
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    shlb $3, %al
+; X86-SSE42-NEXT:    andb $24, %al
+; X86-SSE42-NEXT:    negb %al
+; X86-SSE42-NEXT:    movsbl %al, %eax
+; X86-SSE42-NEXT:    movups 32(%esp,%eax), %xmm0
+; X86-SSE42-NEXT:    movups 48(%esp,%eax), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $76, %esp
@@ -7181,20 +6944,20 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X86-AVX-LABEL: shl_32bytes_qwordOff:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    subl $76, %esp
+; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
+; X86-AVX-NEXT:    vmovups (%eax), %ymm0
 ; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    shlb $3, %cl
-; X86-AVX-NEXT:    andb $24, %cl
-; X86-AVX-NEXT:    negb %cl
-; X86-AVX-NEXT:    movsbl %cl, %ecx
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    shlb $3, %al
+; X86-AVX-NEXT:    andb $24, %al
+; X86-AVX-NEXT:    negb %al
+; X86-AVX-NEXT:    movsbl %al, %eax
+; X86-AVX-NEXT:    vmovups 32(%esp,%eax), %xmm0
+; X86-AVX-NEXT:    vmovups 48(%esp,%eax), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX-NEXT:    addl $76, %esp
@@ -7701,125 +7464,113 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %dl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %al
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 28(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 16(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%edi)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 16(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 20(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, (%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 4(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -7832,81 +7583,68 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%edx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -7919,101 +7657,99 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $108, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esi), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esi), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%edx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esi), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %dl, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %cl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ebp,%ebp), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, 32(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %dl, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ecx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%ebx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 20(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%ebx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %eax, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%esi), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %eax, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ebp, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%ebx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 28(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 20(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, 16(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%edi)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -8026,82 +7762,69 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $92, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlb $3, %cl
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ecx, %edi, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movzbl %al, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ecx, %edi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $76, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -8114,192 +7837,184 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $108, %esp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %al
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 28(%ecx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %cl, %dh
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%ebp), %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%ebp), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 24(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 16(%ebp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 4(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 24(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%ecx)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $108, %esp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    retl
-;
-; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 20(%ebp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
 ; X86-HAVE-SHLD-NO-BMI2-SSE4:       # %bb.0:
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $108, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%ecx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%ecx), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ecx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movzbl %al, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -8312,95 +8027,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $108, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%ecx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %dl, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %eax, 32(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %edx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 12(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %edi, 16(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%ebx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -8413,71 +8126,67 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $108, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%ecx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%ecx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%ecx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movzbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%ebx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -8490,114 +8199,112 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $108, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %al
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movzbl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 28(%ecx)
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %cl, %dh
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%ebp), %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%edi), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %ah
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%ebp), %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebp, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 24(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%edx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 16(%ebp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dh, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 4(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 24(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 16(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%ecx)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 20(%ebp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 8(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 12(%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ah, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%edi)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -8610,72 +8317,66 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $108, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%ecx), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%ecx), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%ecx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andb $28, %al
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movzbl %al, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 20(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 24(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
@@ -8688,95 +8389,93 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $108, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%ecx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %dl, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %eax, 32(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebp, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %edx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebp, %edx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %cl, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %edx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 12(%ebx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %edi, 16(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%ebx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
@@ -8789,71 +8488,67 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $108, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%ecx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%ecx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%ecx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shlb $3, %cl
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andb $28, %al
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movzbl %al, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ebx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ebx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ebx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 24(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 20(%esp,%ebx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 40(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 32(%esp,%ebx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 36(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 16(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, (%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $92, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
@@ -9365,150 +9060,122 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %edi
-; X86-SSE2-NEXT:    movl 12(%eax), %ebx
-; X86-SSE2-NEXT:    movl 16(%eax), %ebp
-; X86-SSE2-NEXT:    movl 20(%eax), %esi
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    subl $76, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl 28(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%ecx), %edx
+; X86-SSE2-NEXT:    movl 4(%ecx), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $7, %eax
-; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
-; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
-; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
-; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT:    movl %edx, (%esp)
+; X86-SSE2-NEXT:    sarl $31, %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: ashr_32bytes_dwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $64, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    subl $76, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movl 16(%edx), %esi
-; X86-SSE42-NEXT:    movl 20(%edx), %edi
-; X86-SSE42-NEXT:    movl 24(%edx), %ebx
-; X86-SSE42-NEXT:    movl 28(%edx), %edx
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 28(%ecx), %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 24(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 20(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 16(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $7, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT:    movups (%ecx), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    sarl $31, %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $7, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,4), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,4), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
+; X86-SSE42-NEXT:    addl $76, %esp
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: ashr_32bytes_dwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $64, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    subl $76, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movl 16(%edx), %esi
-; X86-AVX-NEXT:    movl 20(%edx), %edi
-; X86-AVX-NEXT:    movl 24(%edx), %ebx
-; X86-AVX-NEXT:    movl 28(%edx), %edx
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 24(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 20(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 16(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $7, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
+; X86-AVX-NEXT:    vmovups (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT:    sarl $31, %eax
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $7, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,4), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%eax,4), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
+; X86-AVX-NEXT:    addl $76, %esp
 ; X86-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -9590,150 +9257,122 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $92, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %edi
-; X86-SSE2-NEXT:    movl 12(%eax), %ebx
-; X86-SSE2-NEXT:    movl 16(%eax), %ebp
-; X86-SSE2-NEXT:    movl 20(%eax), %esi
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    subl $76, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl 28(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%ecx), %edx
+; X86-SSE2-NEXT:    movl 4(%ecx), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $3, %eax
-; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
-; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
-; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
-; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
-; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
-; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %edx, (%esp)
+; X86-SSE2-NEXT:    sarl $31, %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl %edx, 28(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
-; X86-SSE2-NEXT:    movl %edi, 8(%eax)
-; X86-SSE2-NEXT:    movl %esi, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $92, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movzbl (%eax), %eax
+; X86-SSE2-NEXT:    andl $3, %eax
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,8), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $76, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: ashr_32bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $64, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    subl $76, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movl 16(%edx), %esi
-; X86-SSE42-NEXT:    movl 20(%edx), %edi
-; X86-SSE42-NEXT:    movl 24(%edx), %ebx
-; X86-SSE42-NEXT:    movl 28(%edx), %edx
-; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 28(%ecx), %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 24(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 20(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 16(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $3, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT:    movups (%ecx), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    sarl $31, %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movzbl (%eax), %eax
+; X86-SSE42-NEXT:    andl $3, %eax
+; X86-SSE42-NEXT:    movups (%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,8), %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $64, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
+; X86-SSE42-NEXT:    addl $76, %esp
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: ashr_32bytes_qwordOff:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $64, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    subl $76, %esp
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX-NEXT:    movl 16(%edx), %esi
-; X86-AVX-NEXT:    movl 20(%edx), %edi
-; X86-AVX-NEXT:    movl 24(%edx), %ebx
-; X86-AVX-NEXT:    movl 28(%edx), %edx
-; X86-AVX-NEXT:    movzbl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
+; X86-AVX-NEXT:    movl 28(%ecx), %eax
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 24(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 20(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl 16(%ecx), %edx
 ; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $3, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT:    vmovups (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT:    sarl $31, %eax
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzbl (%eax), %eax
+; X86-AVX-NEXT:    andl $3, %eax
+; X86-AVX-NEXT:    vmovups (%esp,%eax,8), %xmm0
+; X86-AVX-NEXT:    vmovups 16(%esp,%eax,8), %xmm1
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $64, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
+; X86-AVX-NEXT:    addl $76, %esp
 ; X86-AVX-NEXT:    retl
   %src = load i256, ptr %src.ptr, align 1
   %qwordOff = load i256, ptr %qwordOff.ptr, align 1
@@ -10672,244 +10311,203 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 8(%eax)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edi), %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -10922,153 +10520,109 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ecx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ecx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%ecx), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ecx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -11081,200 +10635,164 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%ebx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, 64(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%esi), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 88(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 84(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 96(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 92(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 104(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 100(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 112(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 108(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 120(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 116(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 124(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebp, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -11287,151 +10805,109 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%ecx), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%ecx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%ecx), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%ecx), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -11444,200 +10920,179 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $156, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %bl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -11650,109 +11105,85 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -11765,154 +11196,140 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $156, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%ebx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, 64(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 88(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 84(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 96(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 92(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 104(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 100(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 112(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 108(%esp,%ebx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 120(%esp,%ebx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 116(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 124(%esp,%ebx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebp, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -11925,107 +11342,85 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $156, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -12038,195 +11433,173 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $156, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%edi), %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %cl, %ch
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 8(%eax)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%esp,%edi), %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -12240,103 +11613,79 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -12350,143 +11699,134 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $156, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%ecx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 120(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 116(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 124(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 60(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 56(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 48(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 52(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 44(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -12500,101 +11840,79 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $156, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -12605,27 +11923,27 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%ecx), %zmm0
-; X86-NO-BMI2-AVX512-NEXT:    movl (%edx), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
-; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
-; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X86-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
 ; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
-; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
-; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
-; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
-; X86-NO-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
-; X86-NO-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
-; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
-; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm3
+; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; X86-NO-BMI2-AVX512-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
@@ -12633,26 +11951,26 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
-; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
-; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
-; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
-; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
-; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
-; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
-; X86-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
-; X86-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm3
+; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
@@ -12789,187 +12107,139 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $188, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    subl $140, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 60(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 56(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 52(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 48(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 44(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 40(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 36(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 32(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $7, %eax
-; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
-; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
-; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
-; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
-; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
-; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %ecx, (%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
-; X86-SSE2-NEXT:    movl %edx, 60(%eax)
-; X86-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $188, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 56(%ecx)
+; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 60(%ecx)
+; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 48(%ecx)
+; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 52(%ecx)
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 40(%ecx)
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 44(%ecx)
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 32(%ecx)
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 36(%ecx)
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,8), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $140, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: lshr_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $140, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups 48(%eax), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups 32(%eax), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    andl $7, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl (%eax), %eax
+; X86-SSE42-NEXT:    andl $7, %eax
+; X86-SSE42-NEXT:    movups 48(%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    movups %xmm0, 48(%ecx)
+; X86-SSE42-NEXT:    movups 32(%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups %xmm0, 32(%ecx)
+; X86-SSE42-NEXT:    movups (%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,8), %xmm1
+; X86-SSE42-NEXT:    movups %xmm1, 16(%ecx)
+; X86-SSE42-NEXT:    movups %xmm0, (%ecx)
 ; X86-SSE42-NEXT:    addl $140, %esp
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: lshr_64bytes_qwordOff:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    subl $140, %esp
+; X86-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT:    movl (%ecx), %ecx
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT:    andl $7, %ecx
-; X86-AVX1-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl (%eax), %eax
+; X86-AVX1-NEXT:    andl $7, %eax
+; X86-AVX1-NEXT:    vmovups 48(%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vmovups %xmm0, 48(%ecx)
+; X86-AVX1-NEXT:    vmovups 32(%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    vmovups %xmm0, 32(%ecx)
+; X86-AVX1-NEXT:    vmovups (%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    vmovups 16(%esp,%eax,8), %xmm1
+; X86-AVX1-NEXT:    vmovups %xmm1, 16(%ecx)
+; X86-AVX1-NEXT:    vmovups %xmm0, (%ecx)
 ; X86-AVX1-NEXT:    addl $140, %esp
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
@@ -12977,14 +12247,14 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
-; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
@@ -12992,14 +12262,14 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
@@ -13957,256 +13227,212 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ebp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %ch
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ch
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 60(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 128(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 48(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 52(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ebp), %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 40(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 44(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 32(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 36(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ebp), %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 24(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 28(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    negl %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 16(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 20(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -14219,153 +13445,113 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ecx), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ecx), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%ecx), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ecx), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%edx), %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 48(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -14378,211 +13564,168 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ebp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%ebp,8), %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebp, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%edi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 56(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, 140(%esp,%ebp), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 60(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 48(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 52(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 32(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 36(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 20(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 8(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebp, %edi, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 36(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -14595,154 +13738,111 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%ebp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%ebp,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 12(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    negl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 176(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %ebp, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shldl %cl, %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %ecx, %edi, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -14755,212 +13855,188 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $156, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ch
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%ebp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 60(%ebx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    negl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 128(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 48(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 52(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%ebp), %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 40(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 44(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 32(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 36(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 24(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    negl %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 28(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 16(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 20(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -14973,109 +14049,89 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%edx), %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 48(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -15088,168 +14144,144 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $156, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%ebp,8), %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%edi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 56(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, 140(%esp,%ebp), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 60(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 48(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 52(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 32(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 36(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 20(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 8(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebp, %edi, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 36(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -15262,110 +14294,87 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%ecx), %xmm3
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    xorps %xmm4, %xmm4
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 48(%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%ebp,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 16(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 12(%eax)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    negl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 176(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %ebp, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shldl %cl, %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %ecx, %edi, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -15378,206 +14387,182 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $156, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ebp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %ch
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %ch
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%ebx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%ebp), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 60(%ebx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    negl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 128(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 48(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 52(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%ebp), %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 40(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 44(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 32(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 36(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%ebp), %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 24(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 28(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    negl %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 16(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 20(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -15591,103 +14576,83 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%edx), %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    negl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 48(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 40(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 16(%eax)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -15701,162 +14666,138 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $156, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%ebp,8), %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 4(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ecx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%edi), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 56(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, 140(%esp,%ebp), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 60(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 48(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 52(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 36(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%edi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 12(%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 20(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 8(%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 8(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ebp, %edi, %ecx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -15870,104 +14811,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $140, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, (%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %ymm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%ebx,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl %ebx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebp, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 176(%esp,%ebx), %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    negl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %ecx, %edi, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -15977,21 +14895,19 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-AVX512-LABEL: shl_64bytes:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
-; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    movl (%esi), %ecx
 ; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
 ; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X86-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
 ; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
-; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm3 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%eax), %zmm3 {%k1} {z}
 ; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
 ; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -15999,27 +14915,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
 ; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-NO-BMI2-AVX512-NEXT:    popl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl (%edx), %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %esi
-; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %esi, %edx, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %edx, %k1
-; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%ecx), %zmm3 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%eax), %zmm3 {%k1} {z}
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -16027,8 +14941,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
@@ -16171,167 +15085,121 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ;
 ; X86-SSE2-LABEL: shl_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $188, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl (%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%ecx), %eax
-; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%ecx), %ebp
-; X86-SSE2-NEXT:    movl 44(%ecx), %ebx
-; X86-SSE2-NEXT:    movl 48(%ecx), %edi
-; X86-SSE2-NEXT:    movl 52(%ecx), %esi
-; X86-SSE2-NEXT:    movl 56(%ecx), %edx
-; X86-SSE2-NEXT:    movl 60(%ecx), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-SSE2-NEXT:    subl $136, %esp
 ; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl 60(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 56(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 52(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 48(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 44(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 40(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 36(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 32(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 28(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 24(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 20(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 16(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%eax), %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%eax), %ecx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
 ; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    shll $3, %ecx
-; X86-SSE2-NEXT:    andl $56, %ecx
-; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    subl %ecx, %eax
+; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl (%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %edx
-; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 44(%eax), %ebp
-; X86-SSE2-NEXT:    movl 40(%eax), %ebx
-; X86-SSE2-NEXT:    movl 52(%eax), %edi
+; X86-SSE2-NEXT:    shll $3, %edx
+; X86-SSE2-NEXT:    andl $56, %edx
+; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    subl %edx, %eax
+; X86-SSE2-NEXT:    movl 56(%eax), %esi
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %esi, 56(%ecx)
 ; X86-SSE2-NEXT:    movl 60(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    negl %ecx
-; X86-SSE2-NEXT:    movl 160(%esp,%ecx), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-SSE2-NEXT:    movl %esi, 60(%eax)
-; X86-SSE2-NEXT:    movl %ecx, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $188, %esp
+; X86-SSE2-NEXT:    movl %esi, 60(%ecx)
+; X86-SSE2-NEXT:    negl %edx
+; X86-SSE2-NEXT:    movl 112(%esp,%edx), %edx
+; X86-SSE2-NEXT:    movl %edx, 48(%ecx)
+; X86-SSE2-NEXT:    movl 52(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 52(%ecx)
+; X86-SSE2-NEXT:    movl 40(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 40(%ecx)
+; X86-SSE2-NEXT:    movl 44(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 44(%ecx)
+; X86-SSE2-NEXT:    movl 32(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 32(%ecx)
+; X86-SSE2-NEXT:    movl 36(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 36(%ecx)
+; X86-SSE2-NEXT:    movl 24(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%eax), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%eax), %edx
+; X86-SSE2-NEXT:    movl 4(%eax), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $136, %esp
 ; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: shl_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
 ; X86-SSE42-NEXT:    subl $140, %esp
+; X86-SSE42-NEXT:    xorps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm4, (%esp)
-; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups 48(%eax), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups 32(%eax), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%eax), %xmm0
+; X86-SSE42-NEXT:    movups 16(%eax), %xmm1
 ; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    shll $3, %ecx
-; X86-SSE42-NEXT:    andl $56, %ecx
-; X86-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    subl %ecx, %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    negl %ecx
-; X86-SSE42-NEXT:    movups 112(%esp,%ecx), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl (%eax), %eax
+; X86-SSE42-NEXT:    shll $3, %eax
+; X86-SSE42-NEXT:    andl $56, %eax
+; X86-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    subl %eax, %ecx
+; X86-SSE42-NEXT:    negl %eax
+; X86-SSE42-NEXT:    movups 112(%esp,%eax), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movups %xmm0, 48(%eax)
+; X86-SSE42-NEXT:    movups 32(%ecx), %xmm0
+; X86-SSE42-NEXT:    movups %xmm0, 32(%eax)
+; X86-SSE42-NEXT:    movups (%ecx), %xmm0
+; X86-SSE42-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
 ; X86-SSE42-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE42-NEXT:    addl $140, %esp
@@ -16340,28 +15208,28 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X86-AVX1-LABEL: shl_64bytes_qwordOff:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    subl $140, %esp
+; X86-AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
-; X86-AVX1-NEXT:    movl (%ecx), %ecx
-; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm2, (%esp)
+; X86-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX1-NEXT:    vmovups 32(%eax), %ymm1
 ; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    shll $3, %ecx
-; X86-AVX1-NEXT:    andl $56, %ecx
-; X86-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    subl %ecx, %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%edx), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%edx), %xmm2
-; X86-AVX1-NEXT:    negl %ecx
-; X86-AVX1-NEXT:    vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl (%eax), %eax
+; X86-AVX1-NEXT:    shll $3, %eax
+; X86-AVX1-NEXT:    andl $56, %eax
+; X86-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    subl %eax, %ecx
+; X86-AVX1-NEXT:    negl %eax
+; X86-AVX1-NEXT:    vmovups 112(%esp,%eax), %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovups %xmm0, 48(%eax)
+; X86-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-AVX1-NEXT:    vmovups %xmm0, 32(%eax)
+; X86-AVX1-NEXT:    vmovups (%ecx), %xmm0
+; X86-AVX1-NEXT:    vmovups 16(%ecx), %xmm1
 ; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
 ; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
 ; X86-AVX1-NEXT:    addl $140, %esp
@@ -16370,33 +15238,29 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ;
 ; X86-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
-; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
-; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-NO-BMI2-AVX512-NEXT:    popl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %esi
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
-; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
@@ -17424,264 +16288,215 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ecx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ebp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%edi), %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%esi), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 84(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 88(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 92(%esp,%edx), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 96(%esp,%eax), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 100(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 104(%esp,%edx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 108(%esp,%ebp), %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 112(%esp,%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 116(%esp,%edx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 120(%esp,%edx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edx,%edx), %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 8(%eax)
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -17694,62 +16509,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl $31, %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -17767,92 +16560,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT:    popl %ebx
@@ -17865,211 +16636,176 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $156, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    shlxl %edx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -18082,61 +16818,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    subl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 12(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 8(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarl $31, %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -18155,90 +16869,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edx, 4(%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT:    popl %ebx
@@ -18251,218 +16945,197 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 16(%eax)
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %ebx
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -18475,24 +17148,22 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm2
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%ecx), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm0
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl $31, %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -18510,92 +17181,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT:    popl %ebx
@@ -18608,173 +17257,158 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    shlxl %edx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -18787,22 +17421,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    subl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%eax), %xmm0
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%eax), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%eax), %xmm2
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%ecx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 32(%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movups 16(%ecx), %xmm1
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarl $31, %eax
@@ -18822,90 +17454,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebx
@@ -18918,216 +17530,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 48(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 8(%eax)
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%esp,%edi), %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esi)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 4(%esi)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -19141,22 +17732,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%ecx), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, (%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarl $31, %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -19174,92 +17763,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%eax), %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 56(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 60(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 48(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 52(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 40(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 40(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 44(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 32(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 32(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 36(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 8(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, (%edx)
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $140, %esp
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
@@ -19273,171 +17840,156 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarl $31, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%esi,8), %edx
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ecx, %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %ebp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 48(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 52(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 44(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 36(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 28(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 20(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%esp,%esi), %eax
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 12(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, 16(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, (%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ecx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%ebp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -19451,21 +18003,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%ecx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarl $31, %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -19484,90 +18034,70 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%edx), %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shll $3, %ecx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 56(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 48(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 44(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 40(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 32(%esp,%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 24(%esp,%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 20(%esp,%edx), %edx
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, (%esi)
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $156, %esp
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
@@ -19577,20 +18107,18 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
-; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%ecx), %zmm1
-; X86-NO-BMI2-AVX512-NEXT:    movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm1
 ; X86-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
 ; X86-NO-BMI2-AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
 ; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2
 ; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm1, %zmm2 {%k1}
 ; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -19603,26 +18131,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm1, %xmm1
 ; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-NO-BMI2-AVX512-NEXT:    popl %esi
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm1
-; X86-HAVE-BMI2-AVX512-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %esi
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm1, %zmm2 {%k1}
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -19635,8 +18161,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm1, %xmm1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
-; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
@@ -19798,269 +18324,209 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ;
 ; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %ebp
-; X86-SSE2-NEXT:    pushl %ebx
-; X86-SSE2-NEXT:    pushl %edi
-; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    subl $188, %esp
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 4(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 8(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 12(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 16(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 20(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 24(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 28(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 32(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 36(%eax), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 40(%eax), %ebp
-; X86-SSE2-NEXT:    movl 44(%eax), %ebx
-; X86-SSE2-NEXT:    movl 48(%eax), %edi
-; X86-SSE2-NEXT:    movl 52(%eax), %esi
-; X86-SSE2-NEXT:    movl 56(%eax), %edx
-; X86-SSE2-NEXT:    movl 60(%eax), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    subl $140, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl 60(%ecx), %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 56(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 52(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 48(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 44(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 40(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 36(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 32(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 28(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 24(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 20(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT:    movl 16(%ecx), %edx
 ; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    sarl $31, %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 12(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl 8(%ecx), %edx
+; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl (%ecx), %edx
+; X86-SSE2-NEXT:    movl 4(%ecx), %ecx
 ; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    andl $7, %eax
-; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
-; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
-; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
-; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
-; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
-; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
-; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT:    movl %edx, (%esp)
+; X86-SSE2-NEXT:    sarl $31, %eax
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
-; X86-SSE2-NEXT:    movl %edx, 60(%eax)
-; X86-SSE2-NEXT:    movl %esi, 48(%eax)
-; X86-SSE2-NEXT:    movl %edi, 52(%eax)
-; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
-; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, (%eax)
-; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
-; X86-SSE2-NEXT:    addl $188, %esp
-; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
-; X86-SSE2-NEXT:    popl %ebx
-; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    movl (%eax), %eax
+; X86-SSE2-NEXT:    andl $7, %eax
+; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movl %edx, 56(%ecx)
+; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 60(%ecx)
+; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 48(%ecx)
+; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 52(%ecx)
+; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 40(%ecx)
+; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 44(%ecx)
+; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 32(%ecx)
+; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 36(%ecx)
+; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 24(%ecx)
+; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 28(%ecx)
+; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 16(%ecx)
+; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 20(%ecx)
+; X86-SSE2-NEXT:    movl 8(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 8(%ecx)
+; X86-SSE2-NEXT:    movl 12(%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
+; X86-SSE2-NEXT:    movl (%esp,%eax,8), %edx
+; X86-SSE2-NEXT:    movl 4(%esp,%eax,8), %eax
+; X86-SSE2-NEXT:    movl %edx, (%ecx)
+; X86-SSE2-NEXT:    movl %eax, 4(%ecx)
+; X86-SSE2-NEXT:    addl $140, %esp
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: ashr_64bytes_qwordOff:
 ; X86-SSE42:       # %bb.0:
-; X86-SSE42-NEXT:    pushl %ebx
-; X86-SSE42-NEXT:    pushl %edi
-; X86-SSE42-NEXT:    pushl %esi
-; X86-SSE42-NEXT:    subl $128, %esp
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    subl $140, %esp
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE42-NEXT:    movups (%edx), %xmm0
-; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
-; X86-SSE42-NEXT:    movl 48(%edx), %esi
-; X86-SSE42-NEXT:    movl 52(%edx), %edi
-; X86-SSE42-NEXT:    movl 56(%edx), %ebx
-; X86-SSE42-NEXT:    movl 60(%edx), %edx
-; X86-SSE42-NEXT:    movl (%ecx), %ecx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
-; X86-SSE42-NEXT:    sarl $31, %edx
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 60(%ecx), %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 56(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 52(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl 48(%ecx), %edx
 ; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT:    andl $7, %ecx
-; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
-; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
-; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
-; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
-; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
-; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
-; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
-; X86-SSE42-NEXT:    movups %xmm0, (%eax)
-; X86-SSE42-NEXT:    addl $128, %esp
-; X86-SSE42-NEXT:    popl %esi
-; X86-SSE42-NEXT:    popl %edi
-; X86-SSE42-NEXT:    popl %ebx
+; X86-SSE42-NEXT:    movups 32(%ecx), %xmm0
+; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movups (%ecx), %xmm0
+; X86-SSE42-NEXT:    movups 16(%ecx), %xmm1
+; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
+; X86-SSE42-NEXT:    sarl $31, %eax
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT:    movl (%eax), %eax
+; X86-SSE42-NEXT:    andl $7, %eax
+; X86-SSE42-NEXT:    movups 48(%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT:    movups %xmm0, 48(%ecx)
+; X86-SSE42-NEXT:    movups 32(%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups %xmm0, 32(%ecx)
+; X86-SSE42-NEXT:    movups (%esp,%eax,8), %xmm0
+; X86-SSE42-NEXT:    movups 16(%esp,%eax,8), %xmm1
+; X86-SSE42-NEXT:    movups %xmm1, 16(%ecx)
+; X86-SSE42-NEXT:    movups %xmm0, (%ecx)
+; X86-SSE42-NEXT:    addl $140, %esp
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: ashr_64bytes_qwordOff:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    pushl %ebx
-; X86-AVX1-NEXT:    pushl %edi
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    subl $128, %esp
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    subl $140, %esp
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX1-NEXT:    vmovups 32(%edx), %xmm1
-; X86-AVX1-NEXT:    movl 48(%edx), %esi
-; X86-AVX1-NEXT:    movl 52(%edx), %edi
-; X86-AVX1-NEXT:    movl 56(%edx), %ebx
-; X86-AVX1-NEXT:    movl 60(%edx), %edx
-; X86-AVX1-NEXT:    movl (%ecx), %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT:    sarl $31, %edx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl 60(%ecx), %eax
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl 56(%ecx), %edx
 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl 52(%ecx), %edx
 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl 48(%ecx), %edx
 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT:    andl $7, %ecx
-; X86-AVX1-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $128, %esp
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    popl %edi
-; X86-AVX1-NEXT:    popl %ebx
+; X86-AVX1-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-AVX1-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovups (%ecx), %ymm0
+; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
+; X86-AVX1-NEXT:    sarl $31, %eax
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl (%eax), %eax
+; X86-AVX1-NEXT:    andl $7, %eax
+; X86-AVX1-NEXT:    vmovups 48(%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    vmovups %xmm0, 48(%ecx)
+; X86-AVX1-NEXT:    vmovups 32(%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    vmovups %xmm0, 32(%ecx)
+; X86-AVX1-NEXT:    vmovups (%esp,%eax,8), %xmm0
+; X86-AVX1-NEXT:    vmovups 16(%esp,%eax,8), %xmm1
+; X86-AVX1-NEXT:    vmovups %xmm1, 16(%ecx)
+; X86-AVX1-NEXT:    vmovups %xmm0, (%ecx)
+; X86-AVX1-NEXT:    addl $140, %esp
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
 ; X86-NO-BMI2-AVX512:       # %bb.0:
 ; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
-; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm0
 ; X86-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
 ; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
 ; X86-NO-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
-; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
-; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
 ; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%eax)
 ; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-NO-BMI2-AVX512-NEXT:    retl
@@ -20068,17 +18534,17 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
 ; X86-HAVE-BMI2-AVX512:       # %bb.0:
 ; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
-; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%eax), %zmm0
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
-; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
-; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
 ; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%eax)
 ; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
 ; X86-HAVE-BMI2-AVX512-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 221a51ed44696..2f7a901cd6a9c 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -27,22 +27,22 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: lshr_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: lshr_4bytes:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %bitOff = load i32, ptr %bitOff.ptr, align 1
@@ -69,22 +69,22 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: shl_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: shl_4bytes:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shlxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
-; X86-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %bitOff = load i32, ptr %bitOff.ptr, align 1
@@ -111,22 +111,22 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-LABEL: ashr_4bytes:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-NEXT:    sarl %cl, %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
-; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NEXT:    sarl %cl, %edx
-; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: ashr_4bytes:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    sarxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
-; X86-BMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI2-NEXT:    movl %eax, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %src = load i32, ptr %src.ptr, align 1
   %bitOff = load i32, ptr %bitOff.ptr, align 1
@@ -153,105 +153,96 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_8bytes:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: lshr_8bytes:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_8bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -277,106 +268,98 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: shl_8bytes:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorl %edx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_8bytes:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 4(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_8bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %edi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorl %esi, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -402,106 +385,98 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_8bytes:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovnel %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: ashr_8bytes:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %eax, %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovnel %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_8bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ecx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovnel %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -589,56 +564,59 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%esi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -648,47 +626,45 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%esi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%esi), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: lshr_16bytes:
@@ -698,47 +674,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -748,48 +724,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -877,60 +851,65 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $60, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -940,45 +919,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -987,50 +970,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $60, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %dl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, 28(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, 44(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $60, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1039,45 +1026,49 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %dl, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%esi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%esi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%esi), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1166,58 +1157,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%esi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1227,50 +1221,48 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%esi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%esi), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: ashr_16bytes:
@@ -1281,49 +1273,49 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1333,51 +1325,49 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_16bytes:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %dl, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%ebp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%ebp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $12, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 8(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %src = load i128, ptr %src.ptr, align 1
   %bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1552,119 +1542,107 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%edi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1677,72 +1655,61 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -1755,97 +1722,93 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebx,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ebx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1858,73 +1821,62 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2110,120 +2062,116 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 20(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2236,73 +2184,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2315,108 +2254,90 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $28, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movsbl %cl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 76(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, 92(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2429,75 +2350,64 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 12(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %esi, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2683,126 +2593,113 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp,4), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2815,78 +2712,67 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx,4), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2899,100 +2785,99 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi,4), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 16(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ebx,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ebx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 12(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%ebx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3005,30 +2890,24 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -3038,46 +2917,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 16(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 20(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%edx,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3423,256 +3297,211 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 40(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 44(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 28(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3685,153 +3514,109 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3844,205 +3629,174 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 44(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 32(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 16(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4055,151 +3809,109 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $156, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 44(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $156, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4546,258 +4258,212 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%edx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 40(%edx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 44(%edx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 32(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 12(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -4810,153 +4476,113 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %ebp, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    negl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 160(%esp,%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 48(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -4969,214 +4595,169 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 140(%esp,%ebp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 60(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%ebp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5189,155 +4770,112 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ebx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%edx), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 112(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 176(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxl %ecx, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -5693,67 +5231,45 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
-; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ecx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5771,195 +5287,172 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 40(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 44(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 28(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -5972,165 +5465,121 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, 56(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 44(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 12(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%edx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 52(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -6143,217 +5592,186 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $bl killed $bl killed $ebx def $ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 44(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 32(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 36(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 28(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 12(%ebp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 16(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -6366,163 +5784,121 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%edx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 56(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%edx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 52(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 44(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 32(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 24(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 16(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%edx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 60(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $156, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fde915247760a..99de88d1d0965 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -28,25 +28,25 @@ define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NEXT:    movzbl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzbl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzbl (%edx), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
-; X86-BMI2-NEXT:    movb %cl, (%eax)
+; X86-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init1 = load i8, ptr %src, align 1
   %intermediate.sroa.0.0.vec.insert = insertelement <2 x i8> <i8 poison, i8 0>, i8 %init1, i64 0
@@ -81,25 +81,25 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzwl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
-; X86-BMI2-NEXT:    movb %cl, (%eax)
+; X86-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init = load <2 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -136,24 +136,24 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
 ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movw %ax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzwl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
-; X86-BMI2-NEXT:    movw %cx, (%eax)
+; X86-BMI2-NEXT:    movw %ax, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init = load <2 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -188,49 +188,43 @@ define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl (%eax), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movl (%edx), %edx
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorl %esi, %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    xorl %edx, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SHLD-NEXT:    testb $32, %cl
-; X86-SHLD-NEXT:    cmovnel %esi, %edx
-; X86-SHLD-NEXT:    movb %dl, (%eax)
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    cmovnel %edx, %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movb %al, (%ecx)
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -265,49 +259,43 @@ define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl (%eax), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movl (%edx), %edx
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorl %esi, %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    xorl %edx, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SHLD-NEXT:    testb $32, %cl
-; X86-SHLD-NEXT:    cmovnel %esi, %edx
-; X86-SHLD-NEXT:    movw %dx, (%eax)
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    cmovnel %edx, %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movw %ax, (%ecx)
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %si, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -341,49 +329,43 @@ define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl (%eax), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movl (%edx), %edx
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorl %esi, %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    xorl %edx, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
 ; X86-SHLD-NEXT:    testb $32, %cl
-; X86-SHLD-NEXT:    cmovnel %esi, %edx
-; X86-SHLD-NEXT:    movl %edx, (%eax)
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    cmovnel %edx, %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %edx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -434,86 +416,80 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-SHLD-NEXT:    movb %bl, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -566,86 +542,80 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movw %si, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -697,86 +667,80 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -828,125 +792,120 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $32, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $36, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-SHLD-NEXT:    movl %edx, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl 8(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl (%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl %eax, %edi
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edi, (%eax)
-; X86-SHLD-NEXT:    addl $32, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $36, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
-; X86-SHLD-NEXT:    popl %ebx
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movdqa %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -981,86 +940,80 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-SHLD-NEXT:    movb %bl, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1126,86 +1079,80 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movw %si, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1270,86 +1217,80 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1430,125 +1371,120 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $64, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $64, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $64, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $68, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-SHLD-NEXT:    movl %edx, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl 8(%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SHLD-NEXT:    movl %eax, %edi
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edi, (%eax)
-; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $68, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
-; X86-SHLD-NEXT:    popl %ebx
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $64, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $64, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1687,60 +1623,65 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%esi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1749,45 +1690,44 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebp
 ; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $92, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl %ecx, %eax
 ; X86-SHLD-NEXT:    shrb $5, %al
-; X86-SHLD-NEXT:    movzbl %al, %ebx
-; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
-; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
-; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
-; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    movzbl %al, %edx
+; X86-SHLD-NEXT:    movl 16(%esp,%edx,4), %eax
+; X86-SHLD-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl %esi, %edi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-SHLD-NEXT:    movl %eax, (%edx)
-; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
 ; X86-SHLD-NEXT:    popl %ebx
-; X86-SHLD-NEXT:    popl %ebp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1797,53 +1737,52 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%edx,4), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ebx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1923,95 +1862,89 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
-; X86-SHLD-NEXT:    subl $136, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    subl $140, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-SHLD-NEXT:    andl $60, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-SHLD-NEXT:    movb %bl, (%eax)
-; X86-SHLD-NEXT:    addl $136, %esp
-; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $140, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2087,95 +2020,89 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $136, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    subl $140, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-SHLD-NEXT:    andl $60, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movw %si, (%eax)
-; X86-SHLD-NEXT:    addl $136, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $140, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2250,95 +2177,89 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $136, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    subl $140, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
-; X86-SHLD-NEXT:    andl $60, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, (%eax)
-; X86-SHLD-NEXT:    addl $136, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $140, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2439,43 +2360,44 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -2485,86 +2407,82 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $128, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $132, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %esi
-; X86-SHLD-NEXT:    andl $60, %esi
-; X86-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-SHLD-NEXT:    movl (%esp,%esi), %edx
-; X86-SHLD-NEXT:    movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl %ecx, %edx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %eax
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    movl %esi, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %edx, %edi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SHLD-NEXT:    movl %edi, 4(%esi)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
-; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edx, (%eax)
-; X86-SHLD-NEXT:    addl $128, %esp
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT:    movl %eax, (%esi)
+; X86-SHLD-NEXT:    addl $132, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
-; X86-SHLD-NEXT:    popl %ebx
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $128, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2730,65 +2648,66 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%edi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -2797,50 +2716,49 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebp
 ; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $128, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %eax
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
-; X86-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 12(%esp,%eax), %esi
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT:    movl %esi, %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-SHLD-NEXT:    movl %esi, 8(%edx)
+; X86-SHLD-NEXT:    movl %edi, 12(%edx)
+; X86-SHLD-NEXT:    movl 8(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-SHLD-NEXT:    movl (%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl %eax, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
 ; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-SHLD-NEXT:    movl %eax, (%edx)
-; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $128, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
 ; X86-SHLD-NEXT:    popl %ebx
-; X86-SHLD-NEXT:    popl %ebp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2849,52 +2767,51 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 16(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3127,105 +3044,106 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ebp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -3234,69 +3152,65 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ;
 ; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebp
 ; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $128, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %esi
-; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 32(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 28(%esp,%eax), %esi
 ; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    movl %edx, %eax
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-SHLD-NEXT:    movl 40(%esp,%edi), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl 44(%esp,%edi), %eax
-; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SHLD-NEXT:    movl %eax, 28(%edi)
-; X86-SHLD-NEXT:    movl %edx, 24(%edi)
-; X86-SHLD-NEXT:    movl %esi, 20(%edi)
-; X86-SHLD-NEXT:    movl %ebp, 16(%edi)
-; X86-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 12(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 8(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-SHLD-NEXT:    movl %esi, %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl %edi, 28(%edx)
+; X86-SHLD-NEXT:    movl 24(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 24(%edx)
+; X86-SHLD-NEXT:    movl 20(%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl %esi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 20(%edx)
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 16(%edx)
+; X86-SHLD-NEXT:    movl 12(%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl %esi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 12(%edx)
+; X86-SHLD-NEXT:    movl 8(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-SHLD-NEXT:    movl (%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl %eax, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-SHLD-NEXT:    movl %ebx, (%edi)
-; X86-SHLD-NEXT:    addl $156, %esp
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $128, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
 ; X86-SHLD-NEXT:    popl %ebx
-; X86-SHLD-NEXT:    popl %ebp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -3305,84 +3219,75 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 32(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 24(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $172, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $140, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index bed8e5806380c..6829b4586d85b 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -30,25 +30,25 @@ define void @load_1byte_chunk_of_2byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    movzwl (%ecx), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, %ecx, %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    movzwl (%edx), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
-; X86-BMI2-NEXT:    movb %cl, (%eax)
+; X86-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init = load <2 x i8>, ptr %src, align 1
   %intermediate.val.frozen = freeze <2 x i8> %init
@@ -83,24 +83,24 @@ define void @load_1byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
 ; X86-NO-BMI2:       # %bb.0:
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NEXT:    movb %al, (%edx)
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
-; X86-BMI2-NEXT:    movb %cl, (%eax)
+; X86-BMI2-NEXT:    movb %al, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.val.frozen = freeze <4 x i8> %init
@@ -134,23 +134,23 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
 ; X86-NO-BMI2:       # %bb.0:
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NEXT:    movl (%eax), %eax
 ; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT:    movl (%edx), %edx
 ; X86-NO-BMI2-NEXT:    shll $3, %ecx
 ; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NEXT:    movw %dx, (%eax)
+; X86-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT:    movw %ax, (%ecx)
 ; X86-NO-BMI2-NEXT:    retl
 ;
 ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
 ; X86-BMI2:       # %bb.0:
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI2-NEXT:    shll $3, %eax
+; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI2-NEXT:    shrxl %eax, (%ecx), %eax
 ; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI2-NEXT:    shll $3, %ecx
-; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
-; X86-BMI2-NEXT:    movw %cx, (%eax)
+; X86-BMI2-NEXT:    movw %ax, (%ecx)
 ; X86-BMI2-NEXT:    retl
   %init = load <4 x i8>, ptr %src, align 1
   %intermediate.val.frozen = freeze <4 x i8> %init
@@ -184,88 +184,80 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %al, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -297,88 +289,80 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %dx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %ax, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -409,88 +393,80 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
 ; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
 ; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %eax, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
   %init = load <8 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -576,83 +552,77 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-SHLD-NEXT:    movb %bl, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -738,83 +708,77 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movw %si, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -899,83 +863,77 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $40, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $44, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, (%eax)
-; X86-SHLD-NEXT:    addl $40, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $44, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1060,122 +1018,117 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $32, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $36, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $3, %dl
-; X86-SHLD-NEXT:    andb $12, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
-; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-SHLD-NEXT:    movl %edx, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $3, %al
+; X86-SHLD-NEXT:    andb $12, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl 8(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl (%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl %eax, %edi
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edi, (%eax)
-; X86-SHLD-NEXT:    addl $32, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $36, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
-; X86-SHLD-NEXT:    popl %ebx
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $32, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <16 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1211,89 +1164,83 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-SHLD-NEXT:    movb %bl, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movb %dl, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1359,89 +1306,83 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movw %si, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movw %dx, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1506,89 +1447,83 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $72, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $76, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, (%eax)
-; X86-SHLD-NEXT:    addl $72, %esp
-; X86-SHLD-NEXT:    popl %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    addl $76, %esp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%ecx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1670,128 +1605,123 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $64, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $64, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $64, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $68, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edx
-; X86-SHLD-NEXT:    shrb $5, %dl
-; X86-SHLD-NEXT:    movzbl %dl, %edx
-; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
-; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
-; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-SHLD-NEXT:    movl %edx, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
+; X86-SHLD-NEXT:    movl %ecx, %eax
+; X86-SHLD-NEXT:    shrb $5, %al
+; X86-SHLD-NEXT:    movzbl %al, %eax
+; X86-SHLD-NEXT:    movl 8(%esp,%eax,4), %edx
+; X86-SHLD-NEXT:    movl (%esp,%eax,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%eax,4), %eax
+; X86-SHLD-NEXT:    movl %eax, %edi
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edi, (%eax)
-; X86-SHLD-NEXT:    addl $64, %esp
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %esi
+; X86-SHLD-NEXT:    movl %esi, (%edx)
+; X86-SHLD-NEXT:    addl $68, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
-; X86-SHLD-NEXT:    popl %ebx
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
 ; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $64, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 4(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $64, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
   %init = load <32 x i8>, ptr %src, align 1
   %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -1932,61 +1862,66 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 4(%esi)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esi)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
@@ -1995,46 +1930,45 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ;
 ; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
 ; X86-SHLD:       # %bb.0:
-; X86-SHLD-NEXT:    pushl %ebp
 ; X86-SHLD-NEXT:    pushl %ebx
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
-; X86-SHLD-NEXT:    subl $92, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    subl $64, %esp
+; X86-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movups (%eax), %xmm0
 ; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    shll $3, %ecx
 ; X86-SHLD-NEXT:    movl %ecx, %eax
 ; X86-SHLD-NEXT:    shrb $5, %al
-; X86-SHLD-NEXT:    movzbl %al, %ebx
-; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
-; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
-; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
-; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
-; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
-; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-SHLD-NEXT:    movl %edi, 4(%edx)
+; X86-SHLD-NEXT:    movzbl %al, %edx
+; X86-SHLD-NEXT:    movl 16(%esp,%edx,4), %eax
+; X86-SHLD-NEXT:    movl 12(%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl %esi, %edi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edi, 12(%eax)
+; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %edi
+; X86-SHLD-NEXT:    movl %edi, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 8(%eax)
+; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-SHLD-NEXT:    movl %eax, (%edx)
-; X86-SHLD-NEXT:    addl $92, %esp
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT:    movl %esi, (%eax)
+; X86-SHLD-NEXT:    addl $64, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
 ; X86-SHLD-NEXT:    popl %ebx
-; X86-SHLD-NEXT:    popl %ebp
 ; X86-SHLD-NEXT:    retl
 ;
 ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
@@ -2044,54 +1978,53 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%esi,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebp, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%edx,4), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%ebx)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/widen_arith-1.ll b/llvm/test/CodeGen/X86/widen_arith-1.ll
index e08df8f76d1a6..08d974d5f3471 100644
--- a/llvm/test/CodeGen/X86/widen_arith-1.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-1.ll
@@ -17,9 +17,9 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl (%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    psubb %xmm0, %xmm1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    pextrb $2, %xmm1, 2(%ecx,%eax,4)
 ; CHECK-NEXT:    pextrw $0, %xmm1, (%ecx,%eax,4)
 ; CHECK-NEXT:    incl (%esp)
diff --git a/llvm/test/CodeGen/X86/widen_arith-3.ll b/llvm/test/CodeGen/X86/widen_arith-3.ll
index 9031588f2690d..3ac5b669ee72f 100644
--- a/llvm/test/CodeGen/X86/widen_arith-3.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-3.ll
@@ -26,10 +26,10 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
 ; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl 12(%ebp), %edx
-; CHECK-NEXT:    movl 8(%ebp), %ecx
+; CHECK-NEXT:    movl 12(%ebp), %ecx
 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    pinsrw $2, 4(%edx,%eax,8), %xmm1
+; CHECK-NEXT:    pinsrw $2, 4(%ecx,%eax,8), %xmm1
+; CHECK-NEXT:    movl 8(%ebp), %ecx
 ; CHECK-NEXT:    psubw %xmm0, %xmm1
 ; CHECK-NEXT:    pextrw $2, %xmm1, 4(%ecx,%eax,8)
 ; CHECK-NEXT:    movd %xmm1, (%ecx,%eax,8)
diff --git a/llvm/test/CodeGen/X86/widen_arith-6.ll b/llvm/test/CodeGen/X86/widen_arith-6.ll
index 6fa232f4d3227..f8420d71b3c39 100644
--- a/llvm/test/CodeGen/X86/widen_arith-6.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-6.ll
@@ -24,12 +24,12 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
 ; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl 8(%ebp), %ecx
 ; CHECK-NEXT:    shll $4, %eax
-; CHECK-NEXT:    movl 12(%ebp), %edx
-; CHECK-NEXT:    movaps (%edx,%eax), %xmm1
+; CHECK-NEXT:    movl 12(%ebp), %ecx
+; CHECK-NEXT:    movaps (%ecx,%eax), %xmm1
 ; CHECK-NEXT:    mulps {{[0-9]+}}(%esp), %xmm1
 ; CHECK-NEXT:    addps %xmm0, %xmm1
+; CHECK-NEXT:    movl 8(%ebp), %ecx
 ; CHECK-NEXT:    extractps $2, %xmm1, 8(%ecx,%eax)
 ; CHECK-NEXT:    extractps $1, %xmm1, 4(%ecx,%eax)
 ; CHECK-NEXT:    movss %xmm1, (%ecx,%eax)
diff --git a/llvm/test/CodeGen/X86/widen_cast-1.ll b/llvm/test/CodeGen/X86/widen_cast-1.ll
index 566dde0ca13d3..d53fa7242ca50 100644
--- a/llvm/test/CodeGen/X86/widen_cast-1.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-1.ll
@@ -19,9 +19,9 @@ define void @convert(ptr %dst, ptr %src) nounwind {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl (%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    psubw %xmm0, %xmm1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movq %xmm1, (%ecx,%eax,8)
 ; CHECK-NEXT:    incl (%esp)
 ; CHECK-NEXT:    cmpl $3, (%esp)
diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll
index cd06f27dcc55c..d35e92a670c0f 100644
--- a/llvm/test/CodeGen/X86/widen_cast-2.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-2.ll
@@ -14,14 +14,14 @@ define void @convert(ptr %dst, ptr %src) nounwind {
 ; CHECK-NEXT:  .LBB0_2: # %forbody
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl (%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    shll $5, %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    movdqa (%edx,%eax), %xmm1
-; CHECK-NEXT:    movdqa 16(%edx,%eax), %xmm2
-; CHECK-NEXT:    psubw %xmm0, %xmm2
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movdqa (%ecx,%eax), %xmm1
+; CHECK-NEXT:    movdqa 16(%ecx,%eax), %xmm2
 ; CHECK-NEXT:    psubw %xmm0, %xmm1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movdqa %xmm1, (%ecx,%eax)
+; CHECK-NEXT:    psubw %xmm0, %xmm2
 ; CHECK-NEXT:    movd %xmm2, 16(%ecx,%eax)
 ; CHECK-NEXT:    pextrd $1, %xmm2, 20(%ecx,%eax)
 ; CHECK-NEXT:    pextrd $2, %xmm2, 24(%ecx,%eax)
diff --git a/llvm/test/CodeGen/X86/widen_cast-3.ll b/llvm/test/CodeGen/X86/widen_cast-3.ll
index badf8c3ed2ecf..4e22bd351c4b9 100644
--- a/llvm/test/CodeGen/X86/widen_cast-3.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-3.ll
@@ -7,9 +7,9 @@
 define void @convert(ptr %dst.addr, <3 x i32> %src) nounwind {
 ; X86-LABEL: convert:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X86-NEXT:    psubd %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pextrd $2, %xmm0, 8(%eax)
 ; X86-NEXT:    pextrd $1, %xmm0, 4(%eax)
 ; X86-NEXT:    movd %xmm0, (%eax)
diff --git a/llvm/test/CodeGen/X86/widen_cast-5.ll b/llvm/test/CodeGen/X86/widen_cast-5.ll
index ed2e9d818f259..828242e445f44 100644
--- a/llvm/test/CodeGen/X86/widen_cast-5.ll
+++ b/llvm/test/CodeGen/X86/widen_cast-5.ll
@@ -7,9 +7,9 @@
 define void @convert(ptr %dst.addr, i64 %src) nounwind {
 ; X86-LABEL: convert:
 ; X86:       ## %bb.0: ## %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/widen_compare-1.ll b/llvm/test/CodeGen/X86/widen_compare-1.ll
index 6daa33c200b90..117def7ff252f 100644
--- a/llvm/test/CodeGen/X86/widen_compare-1.ll
+++ b/llvm/test/CodeGen/X86/widen_compare-1.ll
@@ -24,9 +24,9 @@ define <2 x i16> @compare_v2i64_to_v2i16_binary(ptr %src0, ptr %src1) nounwind {
 ; X86-LABEL: compare_v2i64_to_v2i16_binary:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pmaxuw %xmm1, %xmm0
 ; X86-NEXT:    pcmpeqw %xmm1, %xmm0
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/widen_conv-1.ll b/llvm/test/CodeGen/X86/widen_conv-1.ll
index 2b37697d414b7..74ddbe8f4b2b7 100644
--- a/llvm/test/CodeGen/X86/widen_conv-1.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-1.ll
@@ -7,10 +7,10 @@
 define void @convert_v2i64_to_v2i32(ptr %dst.addr, <2 x i64> %src) nounwind {
 ; X86-LABEL: convert_v2i64_to_v2i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X86-NEXT:    psubd %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -34,11 +34,11 @@ define void @convert_v3i32_to_v3i8(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-LABEL: convert_v3i32_to_v3i8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa (%eax), %xmm0
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X86-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X86-NEXT:    psubb %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pextrb $2, %xmm0, 2(%eax)
 ; X86-NEXT:    pextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    retl
@@ -66,11 +66,11 @@ define void @convert_v5i16_to_v5i8(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-LABEL: convert_v5i16_to_v5i8:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa (%eax), %xmm0
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; X86-NEXT:    pcmpeqd %xmm1, %xmm1
 ; X86-NEXT:    psubb %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pextrb $4, %xmm0, 4(%eax)
 ; X86-NEXT:    movd %xmm0, (%eax)
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/widen_conv-2.ll b/llvm/test/CodeGen/X86/widen_conv-2.ll
index 71f00ffa5645a..684d70debbc4f 100644
--- a/llvm/test/CodeGen/X86/widen_conv-2.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-2.ll
@@ -7,8 +7,8 @@
 define void @convert_v2i16_v2i32(ptr %dst.addr, <2 x i16> %src) nounwind {
 ; X86-LABEL: convert_v2i16_v2i32:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pmovsxwd %xmm0, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll
index f9b588b8b8915..bd98b8681f09f 100644
--- a/llvm/test/CodeGen/X86/widen_conv-3.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -9,18 +9,18 @@
 define void @convert_v2i16_to_v2f32(ptr %dst.addr, <2 x i16> %src) nounwind {
 ; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
 ; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SSE2-NEXT:    psrad $16, %xmm0
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
 ; X86-SSE42:       # %bb.0: # %entry
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    pmovsxwd %xmm0, %xmm0
 ; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    movlps %xmm0, (%eax)
 ; X86-SSE42-NEXT:    retl
 ;
@@ -50,13 +50,12 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movzwl (%ecx), %edx
-; X86-SSE2-NEXT:    movd %edx, %xmm0
+; X86-SSE2-NEXT:    movzwl (%eax), %ecx
+; X86-SSE2-NEXT:    movd %ecx, %xmm0
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT:    movd %ecx, %xmm2
+; X86-SSE2-NEXT:    movzbl 2(%eax), %eax
+; X86-SSE2-NEXT:    movd %eax, %xmm2
 ; X86-SSE2-NEXT:    pslld $16, %xmm2
 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
@@ -64,6 +63,7 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    psrad $24, %xmm0
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
 ; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -75,12 +75,12 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE42:       # %bb.0: # %entry
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movzwl (%ecx), %edx
-; X86-SSE42-NEXT:    movd %edx, %xmm0
-; X86-SSE42-NEXT:    pinsrb $2, 2(%ecx), %xmm0
+; X86-SSE42-NEXT:    movzwl (%eax), %ecx
+; X86-SSE42-NEXT:    movd %ecx, %xmm0
+; X86-SSE42-NEXT:    pinsrb $2, 2(%eax), %xmm0
 ; X86-SSE42-NEXT:    pmovsxbd %xmm0, %xmm0
 ; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    extractps $2, %xmm0, 8(%eax)
 ; X86-SSE42-NEXT:    extractps $1, %xmm0, 4(%eax)
 ; X86-SSE42-NEXT:    movss %xmm0, (%eax)
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
index d4555207d5559..897fb326e541b 100644
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -9,32 +9,32 @@
 define void @convert_v7i16_v7f32(ptr %dst.addr, <7 x i16> %src) nounwind {
 ; X86-SSE2-LABEL: convert_v7i16_v7f32:
 ; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movups %xmm2, (%eax)
+; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE2-NEXT:    movups %xmm0, (%eax)
-; X86-SSE2-NEXT:    movss %xmm2, 16(%eax)
-; X86-SSE2-NEXT:    movaps %xmm2, %xmm0
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; X86-SSE2-NEXT:    movss %xmm0, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    movss %xmm2, 20(%eax)
+; X86-SSE2-NEXT:    movss %xmm0, 16(%eax)
+; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; X86-SSE2-NEXT:    movss %xmm1, 24(%eax)
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    movss %xmm0, 20(%eax)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: convert_v7i16_v7f32:
 ; X86-SSE42:       # %bb.0: # %entry
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X86-SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT:    cvtdq2ps %xmm2, %xmm1
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    extractps $2, %xmm0, 24(%eax)
 ; X86-SSE42-NEXT:    extractps $1, %xmm0, 20(%eax)
+; X86-SSE42-NEXT:    cvtdq2ps %xmm2, %xmm1
 ; X86-SSE42-NEXT:    movups %xmm1, (%eax)
 ; X86-SSE42-NEXT:    movss %xmm0, 16(%eax)
 ; X86-SSE42-NEXT:    retl
@@ -76,13 +76,12 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movzwl (%ecx), %edx
-; X86-SSE2-NEXT:    movd %edx, %xmm0
+; X86-SSE2-NEXT:    movzwl (%eax), %ecx
+; X86-SSE2-NEXT:    movd %ecx, %xmm0
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
-; X86-SSE2-NEXT:    movd %ecx, %xmm2
+; X86-SSE2-NEXT:    movzbl 2(%eax), %eax
+; X86-SSE2-NEXT:    movd %eax, %xmm2
 ; X86-SSE2-NEXT:    pslld $16, %xmm2
 ; X86-SSE2-NEXT:    pandn %xmm2, %xmm1
 ; X86-SSE2-NEXT:    por %xmm0, %xmm1
@@ -90,6 +89,7 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm1, %xmm0
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
 ; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -101,12 +101,12 @@ define void @convert_v3i8_to_v3f32(ptr %dst.addr, ptr %src.addr) nounwind {
 ; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
 ; X86-SSE42:       # %bb.0: # %entry
 ; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE42-NEXT:    movzwl (%ecx), %edx
-; X86-SSE42-NEXT:    movd %edx, %xmm0
-; X86-SSE42-NEXT:    pinsrb $2, 2(%ecx), %xmm0
+; X86-SSE42-NEXT:    movzwl (%eax), %ecx
+; X86-SSE42-NEXT:    movd %ecx, %xmm0
+; X86-SSE42-NEXT:    pinsrb $2, 2(%eax), %xmm0
 ; X86-SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE42-NEXT:    extractps $2, %xmm0, 8(%eax)
 ; X86-SSE42-NEXT:    extractps $1, %xmm0, 4(%eax)
 ; X86-SSE42-NEXT:    movss %xmm0, (%eax)
diff --git a/llvm/test/CodeGen/X86/widen_extract-1.ll b/llvm/test/CodeGen/X86/widen_extract-1.ll
index 3634e6718cd9e..9a6ddff3c43c9 100644
--- a/llvm/test/CodeGen/X86/widen_extract-1.ll
+++ b/llvm/test/CodeGen/X86/widen_extract-1.ll
@@ -7,8 +7,8 @@
 define void @convert(ptr %dst.addr, <3 x double> %src)  {
 ; X32-LABEL: convert:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movaps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/widen_load-0.ll b/llvm/test/CodeGen/X86/widen_load-0.ll
index 4596d1a1ea233..2e1635e8a60c7 100644
--- a/llvm/test/CodeGen/X86/widen_load-0.ll
+++ b/llvm/test/CodeGen/X86/widen_load-0.ll
@@ -11,11 +11,11 @@ define void @short2_int_swap(ptr nocapture %b, ptr nocapture %c) nounwind {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl (%eax), %esi
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll
index 4d0a5da4e7cb6..c0bdbcbd0170d 100644
--- a/llvm/test/CodeGen/X86/widen_load-2.ll
+++ b/llvm/test/CodeGen/X86/widen_load-2.ll
@@ -11,8 +11,8 @@ define void @add3i32(ptr sret(%i32vec3) %ret, ptr %ap, ptr %bp)  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa (%edx), %xmm0
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    paddd (%ecx), %xmm0
 ; X86-NEXT:    pextrd $2, %xmm0, 8(%eax)
 ; X86-NEXT:    pextrd $1, %xmm0, 4(%eax)
@@ -39,10 +39,10 @@ define void @add3i32_2(ptr sret(%i32vec3) %ret, ptr %ap, ptr %bp)  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    pinsrd $1, 4(%edx), %xmm0
-; X86-NEXT:    pinsrd $2, 8(%edx), %xmm0
+; X86-NEXT:    pinsrd $1, 4(%ecx), %xmm0
+; X86-NEXT:    pinsrd $2, 8(%ecx), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pinsrd $1, 4(%ecx), %xmm1
 ; X86-NEXT:    pinsrd $2, 8(%ecx), %xmm1
@@ -76,9 +76,9 @@ define void @add7i32(ptr sret(%i32vec7) %ret, ptr %ap, ptr %bp)  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa (%edx), %xmm0
-; X86-NEXT:    movdqa 16(%edx), %xmm1
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa 16(%ecx), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    paddd (%ecx), %xmm0
 ; X86-NEXT:    paddd 16(%ecx), %xmm1
 ; X86-NEXT:    movd %xmm1, 16(%eax)
@@ -111,16 +111,16 @@ define void @add12i32(ptr sret(%i32vec12) %ret, ptr %ap, ptr %bp)  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa 16(%ecx), %xmm1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa 32(%edx), %xmm0
-; X86-NEXT:    movdqa (%edx), %xmm1
-; X86-NEXT:    movdqa 16(%edx), %xmm2
-; X86-NEXT:    paddd (%ecx), %xmm1
-; X86-NEXT:    paddd 32(%ecx), %xmm0
-; X86-NEXT:    paddd 16(%ecx), %xmm2
-; X86-NEXT:    movdqa %xmm2, 16(%eax)
-; X86-NEXT:    movdqa %xmm0, 32(%eax)
-; X86-NEXT:    movdqa %xmm1, (%eax)
+; X86-NEXT:    paddd (%edx), %xmm0
+; X86-NEXT:    movdqa 32(%ecx), %xmm2
+; X86-NEXT:    paddd 32(%edx), %xmm2
+; X86-NEXT:    paddd 16(%edx), %xmm1
+; X86-NEXT:    movdqa %xmm1, 16(%eax)
+; X86-NEXT:    movdqa %xmm2, 32(%eax)
+; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: add12i32:
@@ -150,9 +150,9 @@ define void @add3i16(ptr nocapture sret(%i16vec3) %ret, ptr %ap, ptr %bp) nounwi
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT:    pinsrw $2, 4(%edx), %xmm0
+; X86-NEXT:    pinsrw $2, 4(%ecx), %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    pinsrw $2, 4(%ecx), %xmm1
 ; X86-NEXT:    paddw %xmm0, %xmm1
@@ -182,8 +182,8 @@ define void @add4i16(ptr nocapture sret(%i16vec4) %ret, ptr %ap, ptr %bp) nounwi
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X86-NEXT:    paddw %xmm0, %xmm1
 ; X86-NEXT:    movq %xmm1, (%eax)
@@ -210,9 +210,9 @@ define void @add12i16(ptr nocapture sret(%i16vec12) %ret, ptr %ap, ptr %bp) noun
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa (%edx), %xmm0
-; X86-NEXT:    movdqa 16(%edx), %xmm1
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa 16(%ecx), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    paddw (%ecx), %xmm0
 ; X86-NEXT:    paddw 16(%ecx), %xmm1
 ; X86-NEXT:    movd %xmm1, 16(%eax)
@@ -243,16 +243,16 @@ define void @add18i16(ptr nocapture sret(%i16vec18) %ret, ptr %ap, ptr %bp) noun
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa 16(%ecx), %xmm1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa 32(%edx), %xmm0
-; X86-NEXT:    movdqa (%edx), %xmm1
-; X86-NEXT:    movdqa 16(%edx), %xmm2
-; X86-NEXT:    paddw (%ecx), %xmm1
-; X86-NEXT:    paddw 32(%ecx), %xmm0
-; X86-NEXT:    paddw 16(%ecx), %xmm2
-; X86-NEXT:    movdqa %xmm2, 16(%eax)
-; X86-NEXT:    movd %xmm0, 32(%eax)
-; X86-NEXT:    movdqa %xmm1, (%eax)
+; X86-NEXT:    paddw (%edx), %xmm0
+; X86-NEXT:    movdqa 32(%ecx), %xmm2
+; X86-NEXT:    paddw 32(%edx), %xmm2
+; X86-NEXT:    paddw 16(%edx), %xmm1
+; X86-NEXT:    movdqa %xmm1, 16(%eax)
+; X86-NEXT:    movd %xmm2, 32(%eax)
+; X86-NEXT:    movdqa %xmm0, (%eax)
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: add18i16:
@@ -282,8 +282,8 @@ define void @add3i8(ptr nocapture sret(%i8vec3) %ret, ptr %ap, ptr %bp) nounwind
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT:    paddb %xmm0, %xmm1
 ; X86-NEXT:    pextrb $2, %xmm1, 2(%eax)
@@ -312,9 +312,9 @@ define void @add31i8(ptr nocapture sret(%i8vec31) %ret, ptr %ap, ptr %bp) nounwi
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movdqa (%edx), %xmm0
-; X86-NEXT:    movdqa 16(%edx), %xmm1
+; X86-NEXT:    movdqa (%ecx), %xmm0
+; X86-NEXT:    movdqa 16(%ecx), %xmm1
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    paddb (%ecx), %xmm0
 ; X86-NEXT:    paddb 16(%ecx), %xmm1
 ; X86-NEXT:    movd %xmm1, 16(%eax)
@@ -352,11 +352,11 @@ define void @rot(ptr nocapture sret(%i8vec3pack) %result, ptr %X, ptr %rot) noun
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movb $-98, 2(%ecx)
+; X86-NEXT:    movw $-24930, (%ecx) # imm = 0x9E9E
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movb $-98, 2(%edx)
-; X86-NEXT:    movw $-24930, (%edx) # imm = 0x9E9E
-; X86-NEXT:    movb $1, 2(%ecx)
-; X86-NEXT:    movw $257, (%ecx) # imm = 0x101
+; X86-NEXT:    movb $1, 2(%edx)
+; X86-NEXT:    movw $257, (%edx) # imm = 0x101
 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-NEXT:    psrlw $1, %xmm0
 ; X86-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
diff --git a/llvm/test/CodeGen/X86/widen_load-3.ll b/llvm/test/CodeGen/X86/widen_load-3.ll
index b3738f5366d3d..10578d5253644 100644
--- a/llvm/test/CodeGen/X86/widen_load-3.ll
+++ b/llvm/test/CodeGen/X86/widen_load-3.ll
@@ -13,14 +13,14 @@ define <7 x i64> @load7_aligned(ptr %x) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movaps (%ecx), %xmm0
-; X86-SSE-NEXT:    movaps 16(%ecx), %xmm1
-; X86-SSE-NEXT:    movaps 32(%ecx), %xmm2
+; X86-SSE-NEXT:    movl 52(%ecx), %edx
+; X86-SSE-NEXT:    movl %edx, 52(%eax)
 ; X86-SSE-NEXT:    movl 48(%ecx), %edx
-; X86-SSE-NEXT:    movl 52(%ecx), %ecx
-; X86-SSE-NEXT:    movl %ecx, 52(%eax)
 ; X86-SSE-NEXT:    movl %edx, 48(%eax)
-; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movaps 32(%ecx), %xmm0
+; X86-SSE-NEXT:    movaps %xmm0, 32(%eax)
+; X86-SSE-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE-NEXT:    movaps 16(%ecx), %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl $4
@@ -29,12 +29,12 @@ define <7 x i64> @load7_aligned(ptr %x) nounwind {
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    vmovaps 48(%ecx), %xmm0
+; X86-AVX-NEXT:    vextractps $1, %xmm0, 52(%eax)
+; X86-AVX-NEXT:    vmovss %xmm0, 48(%eax)
+; X86-AVX-NEXT:    vmovaps 32(%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, 32(%eax)
 ; X86-AVX-NEXT:    vmovaps (%ecx), %ymm0
-; X86-AVX-NEXT:    vmovaps 48(%ecx), %xmm1
-; X86-AVX-NEXT:    vextractps $1, %xmm1, 52(%eax)
-; X86-AVX-NEXT:    vmovss %xmm1, 48(%eax)
-; X86-AVX-NEXT:    vmovaps 32(%ecx), %xmm1
-; X86-AVX-NEXT:    vmovaps %xmm1, 32(%eax)
 ; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl $4
@@ -72,14 +72,14 @@ define <7 x i64> @load7_unaligned(ptr %x) nounwind {
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movups (%ecx), %xmm0
-; X86-SSE-NEXT:    movups 16(%ecx), %xmm1
-; X86-SSE-NEXT:    movups 32(%ecx), %xmm2
+; X86-SSE-NEXT:    movl 52(%ecx), %edx
+; X86-SSE-NEXT:    movl %edx, 52(%eax)
 ; X86-SSE-NEXT:    movl 48(%ecx), %edx
-; X86-SSE-NEXT:    movl 52(%ecx), %ecx
-; X86-SSE-NEXT:    movl %ecx, 52(%eax)
 ; X86-SSE-NEXT:    movl %edx, 48(%eax)
-; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movups 32(%ecx), %xmm0
+; X86-SSE-NEXT:    movaps %xmm0, 32(%eax)
+; X86-SSE-NEXT:    movups (%ecx), %xmm0
+; X86-SSE-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
 ; X86-SSE-NEXT:    movaps %xmm0, (%eax)
 ; X86-SSE-NEXT:    retl $4
@@ -88,13 +88,13 @@ define <7 x i64> @load7_unaligned(ptr %x) nounwind {
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-AVX-NEXT:    vmovups 32(%ecx), %xmm1
+; X86-AVX-NEXT:    movl 52(%ecx), %edx
+; X86-AVX-NEXT:    movl %edx, 52(%eax)
 ; X86-AVX-NEXT:    movl 48(%ecx), %edx
-; X86-AVX-NEXT:    movl 52(%ecx), %ecx
-; X86-AVX-NEXT:    movl %ecx, 52(%eax)
 ; X86-AVX-NEXT:    movl %edx, 48(%eax)
-; X86-AVX-NEXT:    vmovaps %xmm1, 32(%eax)
+; X86-AVX-NEXT:    vmovups 32(%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, 32(%eax)
+; X86-AVX-NEXT:    vmovups (%ecx), %ymm0
 ; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl $4
@@ -133,21 +133,21 @@ define void @load_split(ptr %ld, ptr %st1, ptr %st2) nounwind {
 ; X86-SSE-LABEL: load_split:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movups (%edx), %xmm0
-; X86-SSE-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE-NEXT:    movups %xmm0, (%ecx)
+; X86-SSE-NEXT:    movups (%eax), %xmm0
+; X86-SSE-NEXT:    movups 16(%eax), %xmm1
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movups %xmm1, (%eax)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: load_split:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    vmovups %xmm0, (%ecx)
+; X86-AVX-NEXT:    vmovups (%eax), %ymm0
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vextractf128 $1, %ymm0, (%eax)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
@@ -179,26 +179,26 @@ define void @load_split_more(ptr %src, ptr %idx, ptr %dst) nounwind {
 ; X86-SSE-LABEL: load_split_more:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl (%eax), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SSE-NEXT:    movups (%edx), %xmm0
 ; X86-SSE-NEXT:    movups 16(%edx), %xmm1
-; X86-SSE-NEXT:    movl (%ecx), %edx
-; X86-SSE-NEXT:    movups %xmm0, (%eax,%edx,4)
-; X86-SSE-NEXT:    movl 4(%ecx), %ecx
-; X86-SSE-NEXT:    movups %xmm1, (%eax,%ecx,4)
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movups %xmm0, (%edx,%ecx,4)
+; X86-SSE-NEXT:    movl 4(%eax), %eax
+; X86-SSE-NEXT:    movups %xmm1, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: load_split_more:
 ; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    movl (%eax), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    movl (%ecx), %edx
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax,%edx,4)
-; X86-AVX-NEXT:    movl 4(%ecx), %ecx
-; X86-AVX-NEXT:    vextractf128 $1, %ymm0, (%eax,%ecx,4)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT:    vmovups %xmm0, (%edx,%ecx,4)
+; X86-AVX-NEXT:    movl 4(%eax), %eax
+; X86-AVX-NEXT:    vextractf128 $1, %ymm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
index 3d34205096afe..4292423d90c93 100644
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -6,8 +6,8 @@
 define void @shuf(ptr %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
 ; X86-LABEL: shuf:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addps %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    extractps $2, %xmm0, 8(%eax)
 ; X86-NEXT:    extractps $1, %xmm0, 4(%eax)
 ; X86-NEXT:    movss %xmm0, (%eax)
@@ -31,9 +31,9 @@ entry:
 define void @shuf2(ptr %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
 ; X86-LABEL: shuf2:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; X86-NEXT:    addps %xmm1, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    extractps $2, %xmm0, 8(%eax)
 ; X86-NEXT:    extractps $1, %xmm0, 4(%eax)
 ; X86-NEXT:    movss %xmm0, (%eax)
@@ -59,8 +59,8 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, ptr %dst) nounwind {
 ; X86-LABEL: shuf3:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movaps %xmm1, (%eax)
 ; X86-NEXT:    retl
 ;
@@ -104,8 +104,8 @@ define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
 define void @shuf5(ptr %p) nounwind {
 ; X86-LABEL: shuf5:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd {{.*#+}} xmm0 = [33,33,u,u,u,u,u,u,0,0,u,u,u,u,u,u]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movsd %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 62ea5109f9df2..30d76f99eb77f 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
 ; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
 
@@ -29,6 +30,178 @@ declare i1 @getbool()
 declare i32 @__CxxFrameHandler3(...)
 
 define i32 @try_catch_catch() personality ptr @__CxxFrameHandler3 {
+; X86-LABEL: try_catch_catch:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    movl %esp, -28(%ebp)
+; X86-NEXT:    movl $-1, -16(%ebp)
+; X86-NEXT:    movl $___ehhandler$try_catch_catch, -20(%ebp)
+; X86-NEXT:    movl %fs:0, %eax
+; X86-NEXT:    movl %eax, -24(%ebp)
+; X86-NEXT:    leal -24(%ebp), %eax
+; X86-NEXT:    movl %eax, %fs:0
+; X86-NEXT:    movl $0, -16(%ebp)
+; X86-NEXT:    leal -36(%ebp), %eax
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl $1
+; X86-NEXT:    calll _f
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:  LBB0_1: # %try.cont
+; X86-NEXT:  $ehgcr_0_1:
+; X86-NEXT:    movl -24(%ebp), %eax
+; X86-NEXT:    movl %eax, %fs:0
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+; X86-NEXT:  LBB0_3: # Block address taken
+; X86-NEXT:    # %handler1
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    jmp LBB0_1
+; X86-NEXT:  LBB0_5: # Block address taken
+; X86-NEXT:    # %handler2
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    jmp LBB0_1
+; X86-NEXT:    .def "?catch$2@?0?try_catch_catch at 4HA";
+; X86-NEXT:    .scl 3;
+; X86-NEXT:    .type 32;
+; X86-NEXT:    .endef
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  "?catch$2@?0?try_catch_catch at 4HA":
+; X86-NEXT:  LBB0_2: # %handler1
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    movl %esp, -28(%ebp)
+; X86-NEXT:    movl $1, -16(%ebp)
+; X86-NEXT:    leal -36(%ebp), %eax
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl -32(%ebp)
+; X86-NEXT:    calll _f
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl $LBB0_3, %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl # CATCHRET
+; X86-NEXT:    .def "?catch$4@?0?try_catch_catch at 4HA";
+; X86-NEXT:    .scl 3;
+; X86-NEXT:    .type 32;
+; X86-NEXT:    .endef
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  "?catch$4@?0?try_catch_catch at 4HA":
+; X86-NEXT:  LBB0_4: # %handler2
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    movl %esp, -28(%ebp)
+; X86-NEXT:    movl $1, -16(%ebp)
+; X86-NEXT:    leal -36(%ebp), %eax
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl $3
+; X86-NEXT:    calll _f
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl $LBB0_5, %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl # CATCHRET
+; X86-NEXT:  Lfunc_end0:
+;
+; X64-LABEL: try_catch_catch:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $64, %rsp
+; X64-NEXT:    .seh_stackalloc 64
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    .seh_setframe %rbp, 64
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq $-2, -16(%rbp)
+; X64-NEXT:  .Ltmp0: # EH_LABEL
+; X64-NEXT:    leaq -20(%rbp), %rdx
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    callq f
+; X64-NEXT:    nop
+; X64-NEXT:  .Ltmp1: # EH_LABEL
+; X64-NEXT:  .LBB0_1: # Block address taken
+; X64-NEXT:    # %try.cont
+; X64-NEXT:  $ehgcr_0_1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $64, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_handlerdata
+; X64-NEXT:    .long $cppxdata$try_catch_catch@IMGREL
+; X64-NEXT:    .text
+; X64-NEXT:    .seh_endproc
+; X64-NEXT:    .def "?catch$2@?0?try_catch_catch at 4HA";
+; X64-NEXT:    .scl 3;
+; X64-NEXT:    .type 32;
+; X64-NEXT:    .endef
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  "?catch$2@?0?try_catch_catch at 4HA":
+; X64-NEXT:  .seh_proc "?catch$2@?0?try_catch_catch at 4HA"
+; X64-NEXT:    .seh_handler __CxxFrameHandler3, @unwind, @except
+; X64-NEXT:  .LBB0_2: # %handler1
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    .seh_stackalloc 32
+; X64-NEXT:    leaq 64(%rdx), %rbp
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movl -4(%rbp), %ecx
+; X64-NEXT:    leaq -20(%rbp), %rdx
+; X64-NEXT:    callq f
+; X64-NEXT:    leaq .LBB0_1(%rip), %rax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq # CATCHRET
+; X64-NEXT:    .seh_handlerdata
+; X64-NEXT:    .long $cppxdata$try_catch_catch@IMGREL
+; X64-NEXT:    .text
+; X64-NEXT:    .seh_endproc
+; X64-NEXT:    .def "?catch$3@?0?try_catch_catch at 4HA";
+; X64-NEXT:    .scl 3;
+; X64-NEXT:    .type 32;
+; X64-NEXT:    .endef
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  "?catch$3@?0?try_catch_catch at 4HA":
+; X64-NEXT:  .seh_proc "?catch$3@?0?try_catch_catch at 4HA"
+; X64-NEXT:    .seh_handler __CxxFrameHandler3, @unwind, @except
+; X64-NEXT:  .LBB0_3: # %handler2
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    .seh_stackalloc 32
+; X64-NEXT:    leaq 64(%rdx), %rbp
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    leaq -20(%rbp), %rdx
+; X64-NEXT:    movl $3, %ecx
+; X64-NEXT:    callq f
+; X64-NEXT:    leaq .LBB0_1(%rip), %rax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq # CATCHRET
+; X64-NEXT:  .Lfunc_end0:
+; X64-NEXT:    .seh_handlerdata
+; X64-NEXT:    .long $cppxdata$try_catch_catch@IMGREL
+; X64-NEXT:    .text
+; X64-NEXT:    .seh_endproc
 entry:
   %e.addr = alloca i32
   %local = alloca i32
@@ -225,6 +398,127 @@ try.cont:
 
 
 define i32 @branch_to_normal_dest() personality ptr @__CxxFrameHandler3 {
+; X86-LABEL: branch_to_normal_dest:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl %esp, -28(%ebp)
+; X86-NEXT:    movl $-1, -16(%ebp)
+; X86-NEXT:    movl $___ehhandler$branch_to_normal_dest, -20(%ebp)
+; X86-NEXT:    movl %fs:0, %eax
+; X86-NEXT:    movl %eax, -24(%ebp)
+; X86-NEXT:    leal -24(%ebp), %eax
+; X86-NEXT:    movl %eax, %fs:0
+; X86-NEXT:    movl $0, -16(%ebp)
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $1
+; X86-NEXT:    calll _f
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:  LBB1_1: # %try.cont
+; X86-NEXT:  $ehgcr_1_1:
+; X86-NEXT:    movl -24(%ebp), %eax
+; X86-NEXT:    movl %eax, %fs:0
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+; X86-NEXT:  LBB1_5: # Block address taken
+; X86-NEXT:    # %catch.done
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    jmp LBB1_1
+; X86-NEXT:    .def "?catch$2@?0?branch_to_normal_dest at 4HA";
+; X86-NEXT:    .scl 3;
+; X86-NEXT:    .type 32;
+; X86-NEXT:    .endef
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  "?catch$2@?0?branch_to_normal_dest at 4HA":
+; X86-NEXT:  LBB1_2: # %catch
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    addl $12, %ebp
+; X86-NEXT:    movl %esp, -28(%ebp)
+; X86-NEXT:    movl $1, -16(%ebp)
+; X86-NEXT:  LBB1_3: # %loop
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    calll _getbool
+; X86-NEXT:    testb $1, %al
+; X86-NEXT:    jne LBB1_3
+; X86-NEXT:  # %bb.4: # %catch.done
+; X86-NEXT:    movl $LBB1_5, %eax
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl # CATCHRET
+; X86-NEXT:  Lfunc_end1:
+;
+; X64-LABEL: branch_to_normal_dest:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $48, %rsp
+; X64-NEXT:    .seh_stackalloc 48
+; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    .seh_setframe %rbp, 48
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:    movq $-2, -8(%rbp)
+; X64-NEXT:  .Ltmp2: # EH_LABEL
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    callq f
+; X64-NEXT:    nop
+; X64-NEXT:  .Ltmp3: # EH_LABEL
+; X64-NEXT:  .LBB1_1: # Block address taken
+; X64-NEXT:    # %try.cont
+; X64-NEXT:  $ehgcr_1_1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $48, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq
+; X64-NEXT:    .seh_handlerdata
+; X64-NEXT:    .long $cppxdata$branch_to_normal_dest@IMGREL
+; X64-NEXT:    .text
+; X64-NEXT:    .seh_endproc
+; X64-NEXT:    .def "?catch$2@?0?branch_to_normal_dest at 4HA";
+; X64-NEXT:    .scl 3;
+; X64-NEXT:    .type 32;
+; X64-NEXT:    .endef
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  "?catch$2@?0?branch_to_normal_dest at 4HA":
+; X64-NEXT:  .seh_proc "?catch$2@?0?branch_to_normal_dest at 4HA"
+; X64-NEXT:    .seh_handler __CxxFrameHandler3, @unwind, @except
+; X64-NEXT:  .LBB1_2: # %catch
+; X64-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .seh_pushreg %rbp
+; X64-NEXT:    subq $32, %rsp
+; X64-NEXT:    .seh_stackalloc 32
+; X64-NEXT:    leaq 48(%rdx), %rbp
+; X64-NEXT:    .seh_endprologue
+; X64-NEXT:  .LBB1_3: # %loop
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    callq getbool
+; X64-NEXT:    testb $1, %al
+; X64-NEXT:    jne .LBB1_3
+; X64-NEXT:  # %bb.4: # %catch.done
+; X64-NEXT:    leaq .LBB1_1(%rip), %rax
+; X64-NEXT:    .seh_startepilogue
+; X64-NEXT:    addq $32, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    .seh_endepilogue
+; X64-NEXT:    retq # CATCHRET
+; X64-NEXT:  .Lfunc_end1:
+; X64-NEXT:    .seh_handlerdata
+; X64-NEXT:    .long $cppxdata$branch_to_normal_dest@IMGREL
+; X64-NEXT:    .text
+; X64-NEXT:    .seh_endproc
 entry:
   invoke void @f(i32 1, ptr null)
           to label %try.cont unwind label %catch.dispatch
diff --git a/llvm/test/CodeGen/X86/win-smallparams.ll b/llvm/test/CodeGen/X86/win-smallparams.ll
index ecbc7e907a024..c0fb404261776 100644
--- a/llvm/test/CodeGen/X86/win-smallparams.ll
+++ b/llvm/test/CodeGen/X86/win-smallparams.ll
@@ -80,38 +80,32 @@ define i32 @manyargs(i8 %a, i16 %b, i8 %c, i16 %d, i8 %e, i16 %f) {
 ;
 ; WIN32-MSVC-LABEL: manyargs:
 ; WIN32-MSVC:       # %bb.0: # %entry
-; WIN32-MSVC-NEXT:    pushl %esi
-; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; WIN32-MSVC-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; WIN32-MSVC-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; WIN32-MSVC-NEXT:    addl %eax, %ecx
-; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; WIN32-MSVC-NEXT:    addl %eax, %edx
-; WIN32-MSVC-NEXT:    movswl {{[0-9]+}}(%esp), %esi
-; WIN32-MSVC-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-MSVC-NEXT:    addl %esi, %eax
-; WIN32-MSVC-NEXT:    addl %edx, %eax
+; WIN32-MSVC-NEXT:    addl %ecx, %edx
+; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-MSVC-NEXT:    addl %ecx, %eax
-; WIN32-MSVC-NEXT:    popl %esi
+; WIN32-MSVC-NEXT:    addl %edx, %eax
 ; WIN32-MSVC-NEXT:    retl
 ;
 ; WIN32-GNU-LABEL: manyargs:
 ; WIN32-GNU:       # %bb.0: # %entry
-; WIN32-GNU-NEXT:    pushl %esi
-; WIN32-GNU-NEXT:    .cfi_def_cfa_offset 8
-; WIN32-GNU-NEXT:    .cfi_offset %esi, -8
-; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; WIN32-GNU-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; WIN32-GNU-NEXT:    movsbl {{[0-9]+}}(%esp), %ecx
 ; WIN32-GNU-NEXT:    addl %eax, %ecx
-; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
 ; WIN32-GNU-NEXT:    addl %eax, %edx
-; WIN32-GNU-NEXT:    movswl {{[0-9]+}}(%esp), %esi
-; WIN32-GNU-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-GNU-NEXT:    addl %esi, %eax
-; WIN32-GNU-NEXT:    addl %edx, %eax
+; WIN32-GNU-NEXT:    addl %ecx, %edx
+; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-GNU-NEXT:    addl %ecx, %eax
-; WIN32-GNU-NEXT:    popl %esi
+; WIN32-GNU-NEXT:    addl %edx, %eax
 ; WIN32-GNU-NEXT:    retl
 entry:
   %aa = sext i8 %a to i32
diff --git a/llvm/test/CodeGen/X86/win32-eh.ll b/llvm/test/CodeGen/X86/win32-eh.ll
index 857df9882be47..8316803c8a731 100644
--- a/llvm/test/CodeGen/X86/win32-eh.ll
+++ b/llvm/test/CodeGen/X86/win32-eh.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s
 ; RUN: llc -mtriple=i686-pc-windows-msvc -filetype=obj < %s -o %t
 
@@ -8,10 +9,47 @@ declare i32 @__CxxFrameHandler3(...)
 declare i32 @llvm.eh.typeid.for(ptr)
 
 define internal i32 @catchall_filt() {
+; CHECK-LABEL: catchall_filt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retl
   ret i32 1
 }
 
 define void @use_except_handler3() personality ptr @_except_handler3 {
+; CHECK-LABEL: use_except_handler3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $24, %esp
+; CHECK-NEXT:    movl %esp, -36(%ebp)
+; CHECK-NEXT:    movl $-1, -16(%ebp)
+; CHECK-NEXT:    movl $L__ehtable$use_except_handler3, -20(%ebp)
+; CHECK-NEXT:    movl $__except_handler3, -24(%ebp)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, -28(%ebp)
+; CHECK-NEXT:    leal -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, -16(%ebp)
+; CHECK-NEXT:    calll _may_throw_or_crash
+; CHECK-NEXT:  LBB1_1: # %cont
+; CHECK-NEXT:    movl -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    addl $24, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB1_2: # %catch
+; CHECK-NEXT:  $ehgcr_1_2:
+; CHECK-NEXT:    movl -24(%ebp), %esp
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    jmp LBB1_1
+; CHECK-NEXT:  Lfunc_end0:
 entry:
   invoke void @may_throw_or_crash()
       to label %cont unwind label %lpad
@@ -54,6 +92,45 @@ catch:
 ; CHECK-NEXT:  .long   LBB1_2
 
 define void @use_except_handler4() personality ptr @_except_handler4 {
+; CHECK-LABEL: use_except_handler4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $28, %esp
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    movl %esp, -36(%ebp)
+; CHECK-NEXT:    movl ___security_cookie, %ecx
+; CHECK-NEXT:    movl $L__ehtable$use_except_handler4, %edx
+; CHECK-NEXT:    xorl %ecx, %edx
+; CHECK-NEXT:    movl %edx, -20(%ebp)
+; CHECK-NEXT:    xorl %ecx, %eax
+; CHECK-NEXT:    movl %eax, -40(%ebp)
+; CHECK-NEXT:    movl $-2, -16(%ebp)
+; CHECK-NEXT:    movl $__except_handler4, -24(%ebp)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, -28(%ebp)
+; CHECK-NEXT:    leal -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, -16(%ebp)
+; CHECK-NEXT:    calll _may_throw_or_crash
+; CHECK-NEXT:  LBB2_1: # %cont
+; CHECK-NEXT:    movl -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB2_2: # %catch
+; CHECK-NEXT:  $ehgcr_2_2:
+; CHECK-NEXT:    movl -24(%ebp), %esp
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    jmp LBB2_1
+; CHECK-NEXT:  Lfunc_end1:
 entry:
   invoke void @may_throw_or_crash()
       to label %cont unwind label %lpad
@@ -111,6 +188,45 @@ catch:
 ; CHECK-NEXT:  .long   LBB2_2
 
 define void @use_except_handler4_ssp() sspstrong personality ptr @_except_handler4 {
+; CHECK-LABEL: use_except_handler4_ssp:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $28, %esp
+; CHECK-NEXT:    movl %ebp, %eax
+; CHECK-NEXT:    movl %esp, -36(%ebp)
+; CHECK-NEXT:    movl ___security_cookie, %ecx
+; CHECK-NEXT:    movl $L__ehtable$use_except_handler4_ssp, %edx
+; CHECK-NEXT:    xorl %ecx, %edx
+; CHECK-NEXT:    movl %edx, -20(%ebp)
+; CHECK-NEXT:    xorl %ecx, %eax
+; CHECK-NEXT:    movl %eax, -40(%ebp)
+; CHECK-NEXT:    movl $-2, -16(%ebp)
+; CHECK-NEXT:    movl $__except_handler4, -24(%ebp)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, -28(%ebp)
+; CHECK-NEXT:    leal -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, -16(%ebp)
+; CHECK-NEXT:    calll _may_throw_or_crash
+; CHECK-NEXT:  LBB3_1: # %cont
+; CHECK-NEXT:    movl -28(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB3_2: # %catch
+; CHECK-NEXT:  $ehgcr_3_2:
+; CHECK-NEXT:    movl -24(%ebp), %esp
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    jmp LBB3_1
+; CHECK-NEXT:  Lfunc_end2:
 entry:
   invoke void @may_throw_or_crash()
       to label %cont unwind label %lpad
@@ -147,7 +263,7 @@ catch:
 ; CHECK-NEXT: movl $0, -16(%ebp)
 ; CHECK-NEXT: calll _may_throw_or_crash
 ; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
-; CHECK-NEXT: movl %[[next]], %fs:0   
+; CHECK-NEXT: movl %[[next]], %fs:0
 ; CHECK: retl
 ; CHECK-NEXT: [[catch:[^ ,]*]]: # %catch{{$}}
 
@@ -157,13 +273,58 @@ catch:
 ; CHECK-LABEL: L__ehtable$use_except_handler4_ssp:
 ; CHECK-NEXT:  .long   -2
 ; CHECK-NEXT:  .long   0
-; CHECK-NEXT:  .long   -40  
+; CHECK-NEXT:  .long   -40
 ; CHECK-NEXT:  .long   0
 ; CHECK-NEXT:  .long   -2
 ; CHECK-NEXT:  .long   _catchall_filt
 ; CHECK-NEXT:  .long   [[catch]]
 
 define void @use_CxxFrameHandler3() personality ptr @__CxxFrameHandler3 {
+; CHECK-LABEL: use_CxxFrameHandler3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:    movl %esp, -28(%ebp)
+; CHECK-NEXT:    movl $-1, -16(%ebp)
+; CHECK-NEXT:    movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp)
+; CHECK-NEXT:    movl %fs:0, %eax
+; CHECK-NEXT:    movl %eax, -24(%ebp)
+; CHECK-NEXT:    leal -24(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    movl $0, -16(%ebp)
+; CHECK-NEXT:    calll _may_throw_or_crash
+; CHECK-NEXT:  LBB4_1: # %cont
+; CHECK-NEXT:  $ehgcr_4_1:
+; CHECK-NEXT:    movl -24(%ebp), %eax
+; CHECK-NEXT:    movl %eax, %fs:0
+; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  LBB4_3: # Block address taken
+; CHECK-NEXT:    # %catch
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    jmp LBB4_1
+; CHECK-NEXT:    .def "?catch$2@?0?use_CxxFrameHandler3 at 4HA";
+; CHECK-NEXT:    .scl 3;
+; CHECK-NEXT:    .type 32;
+; CHECK-NEXT:    .endef
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  "?catch$2@?0?use_CxxFrameHandler3 at 4HA":
+; CHECK-NEXT:  LBB4_2: # %catch
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    addl $12, %ebp
+; CHECK-NEXT:    movl %esp, -28(%ebp)
+; CHECK-NEXT:    movl $LBB4_3, %eax
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl # CATCHRET
+; CHECK-NEXT:  Lfunc_end3:
   invoke void @may_throw_or_crash()
       to label %cont unwind label %catchall
 cont:
@@ -212,6 +373,13 @@ catch:
 ; CHECK-LABEL: inlineasm:
 ; CHECK: .safeseh my_handler
 define i32 @inlineasm() {
+; CHECK-LABEL: inlineasm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    .safeseh my_handler
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retl
 entry:
   call void asm sideeffect ".safeseh my_handler", "~{dirflag},~{fpsr},~{flags}"()
   ret i32 0
diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
index 5ac90a0af2e57..d13bdadae4991 100644
--- a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
+++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
@@ -5,11 +5,17 @@
 define i64 @test_sdiv_i64(i64 %a, i64 %b) {
 ; CHECK32-LABEL: test_sdiv_i64:
 ; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    pushl %ecx
+; CHECK32-NEXT:    pushl %eax
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    pushl %edx
 ; CHECK32-NEXT:    calll __alldiv
+; CHECK32-NEXT:    popl %esi
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_sdiv_i64:
@@ -26,11 +32,17 @@ define i64 @test_sdiv_i64(i64 %a, i64 %b) {
 define i64 @test_srem_i64(i64 %a, i64 %b) {
 ; CHECK32-LABEL: test_srem_i64:
 ; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    pushl %ecx
+; CHECK32-NEXT:    pushl %eax
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    pushl %edx
 ; CHECK32-NEXT:    calll __allrem
+; CHECK32-NEXT:    popl %esi
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_srem_i64:
@@ -48,11 +60,17 @@ define i64 @test_srem_i64(i64 %a, i64 %b) {
 define i64 @test_udiv_i64(i64 %a, i64 %b) {
 ; CHECK32-LABEL: test_udiv_i64:
 ; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    pushl %ecx
+; CHECK32-NEXT:    pushl %eax
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    pushl %edx
 ; CHECK32-NEXT:    calll __aulldiv
+; CHECK32-NEXT:    popl %esi
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_udiv_i64:
@@ -69,11 +87,17 @@ define i64 @test_udiv_i64(i64 %a, i64 %b) {
 define i64 @test_urem_i64(i64 %a, i64 %b) {
 ; CHECK32-LABEL: test_urem_i64:
 ; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    pushl %ecx
+; CHECK32-NEXT:    pushl %eax
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    pushl %edx
 ; CHECK32-NEXT:    calll __aullrem
+; CHECK32-NEXT:    popl %esi
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_urem_i64:
diff --git a/llvm/test/CodeGen/X86/win32_sret.ll b/llvm/test/CodeGen/X86/win32_sret.ll
index 6d0e47bccfa76..1352da05652fe 100644
--- a/llvm/test/CodeGen/X86/win32_sret.ll
+++ b/llvm/test/CodeGen/X86/win32_sret.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; We specify -mcpu explicitly to avoid instruction reordering that happens on
 ; some setups (e.g., Atom) from affecting the output.
 ; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
@@ -15,19 +16,9 @@
 
 define void @sret1(ptr sret(i8) %x) nounwind {
 entry:
-; WIN32-LABEL:      _sret1:
-; WIN32:      movb $42, ({{%e[abcd]x}})
-; WIN32-NOT:  popl %eax
-; WIN32:    {{retl$}}
 
-; MINGW_X86-LABEL:  _sret1:
-; MINGW_X86:  {{retl$}}
 
-; CYGWIN-LABEL:     _sret1:
-; CYGWIN:     retl $4
 
-; LINUX-LABEL:      sret1:
-; LINUX:      retl $4
 
   store i8 42, ptr %x, align 4
   ret void
@@ -35,19 +26,9 @@ entry:
 
 define void @sret2(ptr sret(i8) %x, i8 %y) nounwind {
 entry:
-; WIN32-LABEL:      _sret2:
-; WIN32:      movb {{.*}}, ({{%e[abcd]x}})
-; WIN32-NOT:  popl %eax
-; WIN32:    {{retl$}}
 
-; MINGW_X86-LABEL:  _sret2:
-; MINGW_X86:  {{retl$}}
 
-; CYGWIN-LABEL:     _sret2:
-; CYGWIN:     retl $4
 
-; LINUX-LABEL:      sret2:
-; LINUX:      retl $4
 
   store i8 %y, ptr %x
   ret void
@@ -55,20 +36,9 @@ entry:
 
 define void @sret3(ptr sret(i8) %x, ptr %y) nounwind {
 entry:
-; WIN32-LABEL:      _sret3:
-; WIN32:      movb $42, ([[REG1:%e[abcd]x]])
-; WIN32-NOT:  movb $13, ([[REG1]])
-; WIN32-NOT:  popl %eax
-; WIN32:    {{retl$}}
 
-; MINGW_X86-LABEL:  _sret3:
-; MINGW_X86:  {{retl$}}
 
-; CYGWIN-LABEL:     _sret3:
-; CYGWIN:     retl $4
 
-; LINUX-LABEL:      sret3:
-; LINUX:      retl $4
 
   store i8 42, ptr %x
   store i8 13, ptr %y
@@ -80,19 +50,9 @@ entry:
 
 define void @sret4(ptr noalias sret(%struct.S4) %agg.result) {
 entry:
-; WIN32-LABEL:     _sret4:
-; WIN32:     movl $42, ({{%e[abcd]x}})
-; WIN32-NOT: popl %eax
-; WIN32:   {{retl$}}
 
-; MINGW_X86-LABEL: _sret4:
-; MINGW_X86: {{retl$}}
 
-; CYGWIN-LABEL:    _sret4:
-; CYGWIN:    retl $4
 
-; LINUX-LABEL:     sret4:
-; LINUX:     retl $4
 
   store i32 42, ptr %agg.result, align 4
   ret void
@@ -126,48 +86,26 @@ entry:
   %c = alloca %class.C5, align 1
   %s = alloca %struct.S5, align 4
   call x86_thiscallcc void @"\01?foo@C5@@QAE?AUS5@@XZ"(ptr sret(%struct.S5) %s, ptr %c)
-; WIN32-LABEL:      {{^}}_call_foo5:
-; MINGW_X86-LABEL:  {{^}}_call_foo5:
-; CYGWIN-LABEL:     {{^}}_call_foo5:
-; LINUX-LABEL:      {{^}}call_foo5:
 
 
 ; Load the address of the result and put it onto stack
 ; The this pointer goes to ECX.
 ; (through %ecx in the -O0 build).
-; WIN32-DAG:  leal {{[0-9]*}}(%esp), %e{{[a-d]}}x
-; WIN32-DAG:  {{leal [1-9]+\(%esp\)|movl %esp}}, %ecx
-; WIN32-DAG:  {{pushl %e[a-d]x|movl %e[a-d]x, \(%esp\)}}
-; WIN32-NEXT: calll "?foo@C5@@QAE?AUS5@@XZ"
-; WIN32:      retl
   ret void
 }
 
 
 %struct.test6 = type { i32, i32, i32 }
 define void @test6_f(ptr %x) nounwind {
-; WIN32-LABEL: _test6_f:
-; MINGW_X86-LABEL: _test6_f:
-; CYGWIN-LABEL: _test6_f:
-; LINUX-LABEL: test6_f:
 
 ; The %x argument is moved to %ecx. It will be the this pointer.
-; WIN32-DAG: movl    {{16|20}}(%esp), %ecx
 
 
 ; The sret pointer is (%esp)
-; WIN32-DAG:      {{leal 4\(%esp\)|movl %esp}}, %eax
-; WIN32-DAG:      {{pushl   %eax|movl %eax, \(%esp\)}}
 
 ; The sret pointer is %ecx
 ; The %x argument is moved to (%esp). It will be the this pointer.
-; MINGW_X86-DAG:  {{leal 4\(%esp\)|movl %esp}}, %ecx
-; MINGW_X86-DAG: {{pushl   16\(%esp\)|movl %eax, \(%esp\)}}
-; MINGW_X86-NEXT: calll   _test6_g
 
-; CYGWIN-DAG:  {{leal 4\(%esp\)|movl %esp}}, %ecx
-; CYGWIN-DAG:  {{pushl   16\(%esp\)|movl %eax, \(%esp\)}}
-; CYGWIN-NEXT: calll   _test6_g
 
   %tmp = alloca %struct.test6, align 4
   call x86_thiscallcc void @test6_g(ptr sret(%struct.test6) %tmp, ptr %x)
@@ -178,23 +116,10 @@ declare x86_thiscallcc void @test6_g(ptr sret(%struct.test6), ptr)
 ; Flipping the parameters at the IR level generates the same code.
 %struct.test7 = type { i32, i32, i32 }
 define void @test7_f(ptr %x) nounwind {
-; WIN32-LABEL: _test7_f:
-; MINGW_X86-LABEL: _test7_f:
-; CYGWIN-LABEL: _test7_f:
-; LINUX-LABEL: test7_f:
 
 ; The %x argument is moved to %ecx on all OSs. It will be the this pointer.
-; WIN32:      movl    {{16|20}}(%esp), %ecx
-; MINGW_X86:  movl    {{16|20}}(%esp), %ecx
-; CYGWIN:     movl    {{16|20}}(%esp), %ecx
 
 ; The sret pointer is (%esp)
-; WIN32:      {{leal 4\(%esp\)|movl %esp}}, %eax
-; WIN32-NEXT:     {{pushl   %eax|movl %eax, \(%esp\)}}
-; MINGW_X86:      {{leal 4\(%esp\)|movl %esp}}, %eax
-; MINGW_X86-NEXT: {{pushl   %eax|movl %eax, \(%esp\)}}
-; CYGWIN:      {{leal 4\(%esp\)|movl %esp}}, %eax
-; CYGWIN-NEXT: {{pushl   %eax|movl %eax, \(%esp\)}}
 
   %tmp = alloca %struct.test7, align 4
   call x86_thiscallcc void @test7_g(ptr %x, ptr sret(%struct.test7) %tmp)
@@ -208,10 +133,6 @@ define x86_thiscallcc void @test7_g(ptr %in, ptr sret(%struct.test7) %out) {
   ret void
 
 ; Make sure we return the second parameter in %eax.
-; WIN32-LABEL: _test7_g:
-; WIN32: calll _clobber_eax
-; WIN32: movl {{.*}}, %eax
-; WIN32: retl
 }
 
 declare void @clobber_eax()
@@ -224,11 +145,9 @@ define void @test8_f(i64 inreg %a, ptr sret(i64) %out) {
   call void @clobber_eax()
   ret void
 
-; WIN32-LABEL: _test8_f:
-; WIN32: movl {{[0-9]+}}(%esp), %[[out:[a-z]+]]
-; WIN32-DAG: movl {{%e[abcd]x}}, 4(%[[out]])
-; WIN32-DAG: movl {{%e[abcd]x}}, (%[[out]])
-; WIN32: calll _clobber_eax
-; WIN32: movl {{.*}}, %eax
-; WIN32: retl
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CYGWIN: {{.*}}
+; LINUX: {{.*}}
+; MINGW_X86: {{.*}}
+; WIN32: {{.*}}
diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll
index 31c8aee3fddec..bf8a6828d1c83 100644
--- a/llvm/test/CodeGen/X86/x32-va_start.ll
+++ b/llvm/test/CodeGen/X86/x32-va_start.ll
@@ -100,15 +100,14 @@ define i32 @foo(float %a, ptr nocapture readnone %fmt, ...) nounwind {
 ; X32BITABI-LABEL: foo:
 ; X32BITABI:       # %bb.0: # %entry
 ; X32BITABI-NEXT:    subl $28, %esp
-; X32BITABI-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X32BITABI-NEXT:    movl %ecx, (%esp)
-; X32BITABI-NEXT:    cmpl $40, %ecx
+; X32BITABI-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32BITABI-NEXT:    movl %eax, (%esp)
+; X32BITABI-NEXT:    cmpl $40, %eax
 ; X32BITABI-NEXT:    ja .LBB0_2
 ; X32BITABI-NEXT:  # %bb.1: # %vaarg.in_reg
-; X32BITABI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32BITABI-NEXT:    addl %ecx, %eax
-; X32BITABI-NEXT:    addl $8, %ecx
+; X32BITABI-NEXT:    leal 8(%eax), %ecx
 ; X32BITABI-NEXT:    movl %ecx, (%esp)
+; X32BITABI-NEXT:    addl {{[0-9]+}}(%esp), %eax
 ; X32BITABI-NEXT:    jmp .LBB0_3
 ; X32BITABI-NEXT:  .LBB0_2: # %vaarg.in_mem
 ; X32BITABI-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
index a0f937e2c323b..87e198dc35641 100644
--- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
@@ -61,8 +61,8 @@ define x86_intrcc void @test_isr_ecode(ptr byval(%struct.interrupt_frame) %frame
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    cld
-; CHECK-NEXT:    movl 4(%ebp), %eax
-; CHECK-NEXT:    movl 16(%ebp), %ecx
+; CHECK-NEXT:    movl 16(%ebp), %eax
+; CHECK-NEXT:    movl 4(%ebp), %ecx
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    leal -8(%ebp), %esp
@@ -153,8 +153,8 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    fldt f80
 ; CHECK-NEXT:    fld1
+; CHECK-NEXT:    fldt f80
 ; CHECK-NEXT:    faddp %st, %st(1)
 ; CHECK-NEXT:    fstpt f80
 ; CHECK-NEXT:    movl %ebp, %esp
@@ -187,16 +187,14 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
-; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    leal 4(%ebp), %eax
+; CHECK-NEXT:    movl %eax, sink_address
 ; CHECK-NEXT:    leal 20(%ebp), %eax
-; CHECK-NEXT:    leal 4(%ebp), %ecx
-; CHECK-NEXT:    movl %ecx, sink_address
 ; CHECK-NEXT:    movl %eax, sink_address
-; CHECK-NEXT:    leal -8(%ebp), %esp
+; CHECK-NEXT:    leal -4(%ebp), %esp
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    iretl
 ;
@@ -230,20 +228,16 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
-; CHECK-NEXT:    pushl %edx
-; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    leal 8(%ebp), %eax
+; CHECK-NEXT:    movl %eax, sink_address
+; CHECK-NEXT:    leal 24(%ebp), %eax
+; CHECK-NEXT:    movl %eax, sink_address
 ; CHECK-NEXT:    movl 4(%ebp), %eax
-; CHECK-NEXT:    leal 24(%ebp), %ecx
-; CHECK-NEXT:    leal 8(%ebp), %edx
-; CHECK-NEXT:    movl %edx, sink_address
-; CHECK-NEXT:    movl %ecx, sink_address
 ; CHECK-NEXT:    movl %eax, sink_i32
-; CHECK-NEXT:    leal -12(%ebp), %esp
+; CHECK-NEXT:    leal -4(%ebp), %esp
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:    popl %ecx
-; CHECK-NEXT:    popl %edx
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    addl $4, %esp
 ; CHECK-NEXT:    iretl
diff --git a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
index e5732d76ee8b3..1c8b40f7d7d3c 100644
--- a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
@@ -5,9 +5,9 @@
 define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_sse:
 ; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; DARWIN-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; DARWIN-NEXT:    vpaddd %xmm3, %xmm2, %xmm1
-; DARWIN-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; DARWIN-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; DARWIN-NEXT:    retl
 ;
 ; LINUX-LABEL: test_sse:
@@ -27,9 +27,9 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %
 define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_avx:
 ; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
 ; DARWIN-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; DARWIN-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
-; DARWIN-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; DARWIN-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; DARWIN-NEXT:    retl
 ;
 ; LINUX-LABEL: test_avx:
@@ -53,9 +53,9 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
 define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_avx512:
 ; DARWIN:       ## %bb.0:
+; DARWIN-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
 ; DARWIN-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; DARWIN-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
-; DARWIN-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; DARWIN-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; DARWIN-NEXT:    retl
 ;
 ; LINUX-LABEL: test_avx512:
diff --git a/llvm/test/CodeGen/X86/x86-shifts.ll b/llvm/test/CodeGen/X86/x86-shifts.ll
index 8d469a39a5700..aba26ce27f4f2 100644
--- a/llvm/test/CodeGen/X86/x86-shifts.ll
+++ b/llvm/test/CodeGen/X86/x86-shifts.ll
@@ -5,13 +5,21 @@
 ; Splat patterns below
 
 define <4 x i32> @shl4(<4 x i32> %A) nounwind {
-; CHECK-LABEL: shl4:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    pslld $2, %xmm1
-; CHECK-NEXT:    paddd %xmm0, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shl4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    paddd %xmm1, %xmm1
+; X86-NEXT:    pslld $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shl4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pslld $2, %xmm1
+; X64-NEXT:    paddd %xmm0, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
   %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
@@ -20,13 +28,21 @@ entry:
 }
 
 define <4 x i32> @shr4(<4 x i32> %A) nounwind {
-; CHECK-LABEL: shr4:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrld $2, %xmm1
-; CHECK-NEXT:    psrld $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shr4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $1, %xmm1
+; X86-NEXT:    psrld $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shr4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrld $2, %xmm1
+; X64-NEXT:    psrld $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = lshr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
   %C = lshr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
@@ -35,13 +51,21 @@ entry:
 }
 
 define <4 x i32> @sra4(<4 x i32> %A) nounwind {
-; CHECK-LABEL: sra4:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrad $2, %xmm1
-; CHECK-NEXT:    psrad $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: sra4:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrad $1, %xmm1
+; X86-NEXT:    psrad $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sra4:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrad $2, %xmm1
+; X64-NEXT:    psrad $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = ashr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
   %C = ashr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
@@ -50,13 +74,21 @@ entry:
 }
 
 define <2 x i64> @shl2(<2 x i64> %A) nounwind {
-; CHECK-LABEL: shl2:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psllq $2, %xmm1
-; CHECK-NEXT:    psllq $9, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shl2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psllq $9, %xmm1
+; X86-NEXT:    psllq $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shl2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psllq $2, %xmm1
+; X64-NEXT:    psllq $9, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = shl <2 x i64> %A,  < i64 2, i64 2>
   %C = shl <2 x i64> %A,  < i64 9, i64 9>
@@ -65,13 +97,21 @@ entry:
 }
 
 define <2 x i64> @shr2(<2 x i64> %A) nounwind {
-; CHECK-LABEL: shr2:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $8, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shr2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlq $1, %xmm1
+; X86-NEXT:    psrlq $8, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shr2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlq $8, %xmm1
+; X64-NEXT:    psrlq $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = lshr <2 x i64> %A,  < i64 8, i64 8>
   %C = lshr <2 x i64> %A,  < i64 1, i64 1>
@@ -80,13 +120,21 @@ entry:
 }
 
 define <8 x i16> @shl8(<8 x i16> %A) nounwind {
-; CHECK-LABEL: shl8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psllw $2, %xmm1
-; CHECK-NEXT:    paddw %xmm0, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shl8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    paddw %xmm1, %xmm1
+; X86-NEXT:    psllw $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shl8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psllw $2, %xmm1
+; X64-NEXT:    paddw %xmm0, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -95,13 +143,21 @@ entry:
 }
 
 define <8 x i16> @shr8(<8 x i16> %A) nounwind {
-; CHECK-LABEL: shr8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $2, %xmm1
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shr8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlw $1, %xmm1
+; X86-NEXT:    psrlw $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shr8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlw $2, %xmm1
+; X64-NEXT:    psrlw $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = lshr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   %C = lshr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -110,13 +166,21 @@ entry:
 }
 
 define <8 x i16> @sra8(<8 x i16> %A) nounwind {
-; CHECK-LABEL: sra8:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psraw $2, %xmm1
-; CHECK-NEXT:    psraw $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: sra8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psraw $1, %xmm1
+; X86-NEXT:    psraw $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sra8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psraw $2, %xmm1
+; X64-NEXT:    psraw $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = ashr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   %C = ashr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -129,9 +193,9 @@ entry:
 define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
 ; X86-LABEL: sll8_nosplat:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,64,4,4,4,4]
+; X86-NEXT:    movdqa {{.*#+}} xmm1 = [512,128,32,2,16,2,2,2]
 ; X86-NEXT:    pmullw %xmm0, %xmm1
-; X86-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,128,32,2,16,2,2,2]
+; X86-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,4,8,64,4,4,4,4]
 ; X86-NEXT:    pxor %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
@@ -150,16 +214,27 @@ entry:
 }
 
 define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
-; CHECK-LABEL: shr2_nosplat:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $8, %xmm1
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $1, %xmm2
-; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; CHECK-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; CHECK-NEXT:    xorpd %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shr2_nosplat:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrlq $1, %xmm1
+; X86-NEXT:    movdqa %xmm0, %xmm2
+; X86-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X86-NEXT:    psrlq $8, %xmm0
+; X86-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X86-NEXT:    xorpd %xmm2, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shr2_nosplat:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlq $8, %xmm1
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    psrlq $1, %xmm2
+; X64-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
+; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-NEXT:    xorpd %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = lshr <2 x i64> %A,  < i64 8, i64 1>
   %C = lshr <2 x i64> %A,  < i64 1, i64 0>
@@ -170,13 +245,21 @@ entry:
 ; Other shifts
 
 define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
-; CHECK-LABEL: shl2_other:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    pslld $2, %xmm1
-; CHECK-NEXT:    pslld $9, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shl2_other:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    pslld $9, %xmm1
+; X86-NEXT:    pslld $2, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shl2_other:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    pslld $2, %xmm1
+; X64-NEXT:    pslld $9, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = shl <2 x i32> %A,  < i32 2, i32 2>
   %C = shl <2 x i32> %A,  < i32 9, i32 9>
@@ -185,13 +268,21 @@ entry:
 }
 
 define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
-; CHECK-LABEL: shr2_other:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrld $8, %xmm1
-; CHECK-NEXT:    psrld $1, %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm0
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: shr2_other:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movdqa %xmm0, %xmm1
+; X86-NEXT:    psrld $1, %xmm1
+; X86-NEXT:    psrld $8, %xmm0
+; X86-NEXT:    pxor %xmm1, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: shr2_other:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrld $8, %xmm1
+; X64-NEXT:    psrld $1, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
 entry:
   %B = lshr <2 x i32> %A,  < i32 8, i32 8>
   %C = lshr <2 x i32> %A,  < i32 1, i32 1>
diff --git a/llvm/test/CodeGen/X86/x87.ll b/llvm/test/CodeGen/X86/x87.ll
index aa0d1c5206b63..acc2f4c939a42 100644
--- a/llvm/test/CodeGen/X86/x87.ll
+++ b/llvm/test/CodeGen/X86/x87.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=i686-- | FileCheck %s -check-prefixes=X8732,X87
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s -check-prefixes=X8732,X87
 ; RUN: llc < %s -mtriple=i686-- -mattr=-x87 | FileCheck %s -check-prefixes=NOX8732,NOX87
@@ -6,53 +7,35 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse2 | FileCheck %s -check-prefixes=NOX87
 
 define void @test(i32 %i, i64 %l, ptr %pf, ptr %pd, ptr %pld) nounwind readnone {
-; X87-LABEL: test:
-; NOX87-LABEL: test:
 
-; NOX87-NOT: {{ }}f{{.*}}
 
-; X87: fild
-; NOX8732: __floatunsisf
   %tmp = uitofp i32 %i to float
 
-; X8732: fild
-; NOX8732: __floatdisf
   %tmp1 = sitofp i64 %l to float
 
-; X8732: fadd
-; NOX8732: __addsf3
   %tmp2 = fadd float %tmp, %tmp1
 
-; X8732: fstp
   store float %tmp2, ptr %pf
 
-; X87: fild
-; NOX87: __floatunsidf
   %tmp3 = uitofp i32 %i to double
 
-; X87: fild
-; NOX87: __floatdidf
   %tmp4 = sitofp i64 %l to double
 
-; X87: fadd
-; NOX87: __adddf3
   %tmp5 = fadd double %tmp3, %tmp4
 
-; X87: fstp
   store double %tmp5, ptr %pd
 
-; X87: __floatsitf
-; NOX87: __floatsitf
   %tmp6 = sitofp i32 %i to fp128
 
-; X87: __floatunditf
-; NOX87: __floatunditf
   %tmp7 = uitofp i64 %l to fp128
 
-; X87: __addtf3
-; NOX87: __addtf3
   %tmp8 = fadd fp128 %tmp6, %tmp7
   store fp128 %tmp8, ptr %pld
 
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOX87: {{.*}}
+; NOX8732: {{.*}}
+; X87: {{.*}}
+; X8732: {{.*}}
diff --git a/llvm/test/CodeGen/X86/xaluo128.ll b/llvm/test/CodeGen/X86/xaluo128.ll
index 977df0f16bb28..7a57300ffe347 100644
--- a/llvm/test/CodeGen/X86/xaluo128.ll
+++ b/llvm/test/CodeGen/X86/xaluo128.ll
@@ -14,26 +14,22 @@ define zeroext i1 @saddoi128(i128 %v1, i128 %v2, ptr %res) nounwind {
 ;
 ; X86-LABEL: saddoi128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    seto %al
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %t = call {i128, i1} @llvm.sadd.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
@@ -54,26 +50,22 @@ define zeroext i1 @uaddoi128(i128 %v1, i128 %v2, ptr %res) nounwind {
 ;
 ; X86-LABEL: uaddoi128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %t = call {i128, i1} @llvm.uadd.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
@@ -95,26 +87,22 @@ define zeroext i1 @ssuboi128(i128 %v1, i128 %v2, ptr %res) nounwind {
 ;
 ; X86-LABEL: ssuboi128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    seto %al
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %t = call {i128, i1} @llvm.ssub.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
@@ -135,26 +123,22 @@ define zeroext i1 @usuboi128(i128 %v1, i128 %v2, ptr %res) nounwind {
 ;
 ; X86-LABEL: usuboi128:
 ; X86:       ## %bb.0:
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
   %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index a076d0d762aa3..14b35aa3d24a3 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -90,12 +90,11 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi8:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -129,11 +128,11 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi16:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %cx
+; WIN32-NEXT:    movw %cx, (%eax)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movw %dx, (%ecx)
 ; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -167,11 +166,11 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi32:
 ; WIN32:       # %bb.0:
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movl %edx, (%ecx)
 ; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -209,54 +208,51 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $8, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ebx, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    adcl %esi, %edi
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %ecx
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %esi, 4(%eax)
+; WIN32-NEXT:    adcl %ecx, %ebx
+; WIN32-NEXT:    movl %ebx, %ebp
 ; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    xorl %ecx, %edx
-; WIN32-NEXT:    xorl %eax, %ecx
-; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    xorl %esi, %edx
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    orl %edx, %esi
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -303,12 +299,11 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi8:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
@@ -352,14 +347,11 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi16:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movw %ax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
@@ -401,14 +393,11 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi32:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movl %eax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -454,32 +443,34 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %al, %bl
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    seto %dl
+; WIN32-NEXT:    orb %bh, %dl
+; WIN32-NEXT:    orb %bl, %dl
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %ecx, 4(%eax)
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %dl, %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -553,11 +544,11 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    imull %edi, %esi
 ; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %edx, %ecx
@@ -675,42 +666,38 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %al, %cl
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    orb %ch, %bl
+; WIN32-NEXT:    orb %cl, %bl
+; WIN32-NEXT:    leal (%ebp,%eax), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    testl %ebp, %ebp
-; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %al, %bl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edi, %edx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    seto %bh
-; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    orb %bl, %bh
-; WIN32-NEXT:    addl %eax, %edi
-; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    addl %ecx, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %bh, %al
+; WIN32-NEXT:    orb %bl, %al
 ; WIN32-NEXT:    testb %al, %al
 ; WIN32-NEXT:    jne LBB14_2
 ; WIN32-NEXT:  # %bb.1:
-; WIN32-NEXT:    movl %ebp, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:  LBB14_2:
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl %edi, %edx
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -958,13 +945,13 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %ebp, %eax
@@ -976,12 +963,11 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    sarl $31, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %esi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    imull %esi, %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %ecx
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    addl %ebp, %esi
@@ -1266,28 +1252,28 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    seto %cl
+; WIN32-NEXT:    orb %bh, %cl
+; WIN32-NEXT:    orb %bl, %cl
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %ch, %al
+; WIN32-NEXT:    orb %cl, %al
 ; WIN32-NEXT:    subb $1, %al
 ; WIN32-NEXT:    je LBB22_1
 ; WIN32-NEXT:  # %bb.3: # %continue
@@ -1386,13 +1372,12 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi8_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzbl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i8, ptr %ptr1
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1435,13 +1420,12 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi8_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    imulb (%ecx)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imulb (%edx)
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i8, ptr %ptr2
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1476,12 +1460,12 @@ define zeroext i1 @smuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi16_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movzwl (%eax), %edx
-; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT:    movzwl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imulw {{[0-9]+}}(%esp), %ax
+; WIN32-NEXT:    movw %ax, (%ecx)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movw %dx, (%ecx)
 ; WIN32-NEXT:    retl
   %v1 = load i16, ptr %ptr1
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -1516,12 +1500,12 @@ define zeroext i1 @smuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi16_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imulw (%eax), %dx
+; WIN32-NEXT:    imulw (%ecx), %dx
+; WIN32-NEXT:    movw %dx, (%eax)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movw %dx, (%ecx)
 ; WIN32-NEXT:    retl
   %v2 = load i16, ptr %ptr2
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -1556,12 +1540,12 @@ define zeroext i1 @smuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi32_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %edx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, (%ecx)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movl %edx, (%ecx)
 ; WIN32-NEXT:    retl
   %v1 = load i32, ptr %ptr1
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -1596,12 +1580,12 @@ define zeroext i1 @smuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: smuloi32_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull (%eax), %edx
+; WIN32-NEXT:    imull (%ecx), %edx
+; WIN32-NEXT:    movl %edx, (%eax)
 ; WIN32-NEXT:    seto %al
-; WIN32-NEXT:    movl %edx, (%ecx)
 ; WIN32-NEXT:    retl
   %v2 = load i32, ptr %ptr2
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -1640,57 +1624,55 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ecx
-; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl (%eax), %esi
+; WIN32-NEXT:    movl 4(%eax), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edi, %ebx
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %edi, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    adcl %ebx, %edi
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    imull %esi, %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %esi, 4(%eax)
+; WIN32-NEXT:    adcl %ecx, %ebx
+; WIN32-NEXT:    movl %ebx, %ebp
 ; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    imull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    xorl %ecx, %edx
-; WIN32-NEXT:    xorl %eax, %ecx
-; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    xorl %esi, %edx
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    orl %edx, %esi
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    addl $8, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1733,57 +1715,54 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $12, %esp
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl (%ecx), %ebx
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    movl (%eax), %ecx
+; WIN32-NEXT:    movl 4(%eax), %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl 4(%eax), %ecx
-; WIN32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ebx, %ebp
+; WIN32-NEXT:    adcl %esi, %edi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    sarl $31, %eax
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %ecx
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %esi, 4(%eax)
+; WIN32-NEXT:    adcl %ecx, %ebx
+; WIN32-NEXT:    movl %ebx, %ebp
 ; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imull (%esp) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    xorl %ecx, %edx
-; WIN32-NEXT:    xorl %eax, %ecx
-; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    xorl %esi, %edx
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    orl %edx, %esi
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    addl $8, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1829,13 +1808,12 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi8_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzbl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mulb {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i8, ptr %ptr1
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1878,13 +1856,12 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi8_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mulb (%ecx)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movb %al, (%edx)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    mulb (%edx)
+; WIN32-NEXT:    movb %al, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i8, ptr %ptr2
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -1928,15 +1905,12 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi16_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movzwl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mulw {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movw %ax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i16, ptr %ptr1
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -1981,15 +1955,12 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi16_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mulw (%ecx)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movw %ax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    mulw (%edx)
+; WIN32-NEXT:    movw %ax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i16, ptr %ptr2
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -2032,15 +2003,12 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi32_load:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movl %eax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v1 = load i32, ptr %ptr1
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -2083,15 +2051,12 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
 ;
 ; WIN32-LABEL: umuloi32_load2:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull (%ecx)
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    mull (%edx)
+; WIN32-NEXT:    movl %eax, (%ecx)
+; WIN32-NEXT:    seto %al
 ; WIN32-NEXT:    retl
   %v2 = load i32, ptr %ptr2
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -2138,33 +2103,35 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebp
-; WIN32-NEXT:    movl 4(%eax), %eax
-; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    movl (%eax), %esi
+; WIN32-NEXT:    movl 4(%eax), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %al, %bl
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    seto %dl
+; WIN32-NEXT:    orb %bh, %dl
+; WIN32-NEXT:    orb %bl, %dl
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %ecx, 4(%eax)
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %dl, %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -2216,32 +2183,34 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl (%ecx), %ebp
-; WIN32-NEXT:    movl 4(%ecx), %esi
-; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    movl (%eax), %ebp
+; WIN32-NEXT:    movl 4(%eax), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %eax, (%edx)
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %al, %bl
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
+; WIN32-NEXT:    seto %dl
+; WIN32-NEXT:    orb %bh, %dl
+; WIN32-NEXT:    orb %bl, %dl
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, 4(%eax)
+; WIN32-NEXT:    setb %al
+; WIN32-NEXT:    orb %dl, %al
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/xor-icmp.ll b/llvm/test/CodeGen/X86/xor-icmp.ll
index 16a3b6cb855a7..b392155d3b432 100644
--- a/llvm/test/CodeGen/X86/xor-icmp.ll
+++ b/llvm/test/CodeGen/X86/xor-icmp.ll
@@ -110,10 +110,10 @@ define i1 @xor_not_bools(i1 zeroext %x, i1 zeroext %y) nounwind {
 define zeroext i1 @xor_not_cmps(i32 %x, i32 %y) nounwind {
 ; X86-LABEL: xor_not_cmps:
 ; X86:       # %bb.0:
-; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
-; X86-NEXT:    setne %cl
 ; X86-NEXT:    cmpl $235, {{[0-9]+}}(%esp)
-; X86-NEXT:    sete %al
+; X86-NEXT:    sete %cl
+; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %al
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    xorb $1, %al
 ; X86-NEXT:    retl
@@ -137,14 +137,14 @@ define zeroext i1 @xor_not_cmps(i32 %x, i32 %y) nounwind {
 define zeroext i1 @xor_not_cmps_extra_use(i32 %x, i32 %y, ptr %p) nounwind {
 ; X86-LABEL: xor_not_cmps_extra_use:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
-; X86-NEXT:    setne %dl
 ; X86-NEXT:    cmpl $235, {{[0-9]+}}(%esp)
-; X86-NEXT:    sete %al
-; X86-NEXT:    xorb %dl, %al
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    sete %cl
+; X86-NEXT:    cmpl $42, {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %al
+; X86-NEXT:    xorb %cl, %al
+; X86-NEXT:    movzbl %al, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
 ; X86-NEXT:    xorb $1, %al
 ; X86-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll
index d50752e48d293..29c8bf7b70249 100644
--- a/llvm/test/CodeGen/X86/xor-lea.ll
+++ b/llvm/test/CodeGen/X86/xor-lea.ll
@@ -101,9 +101,9 @@ define i32 @xor_notsminval_i32(i32 %x) {
 define i64 @xor_sminval_i64(i64 %x) {
 ; X86-LABEL: xor_sminval_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-2147483648, %edx # imm = 0x80000000
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: xor_sminval_i64:
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index 2bef66825d8c0..193f2afde7fe4 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -493,9 +493,9 @@ define i32 @PR39657(ptr %p, i64 %x) {
 ; X86-LABEL: PR39657:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    notl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    notl %ecx
-; X86-NEXT:    movl (%eax,%ecx,4), %eax
+; X86-NEXT:    movl (%ecx,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LIN-LABEL: PR39657:
diff --git a/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll b/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll
index f56044b857b93..4b2bb20cf173c 100644
--- a/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll
+++ b/llvm/test/CodeGen/X86/zero-call-used-regs-i386.ll
@@ -37,9 +37,9 @@ define dso_local zeroext i1 @test1(ptr nocapture noundef readonly %0) local_unna
 ; I386-NEXT:    movb mt_pivots_0, %ah
 ; I386-NEXT:    movb %ah, %al
 ; I386-NEXT:    decb %al
-; I386-NEXT:    movl mas_data_end___trans_tmp_2, %ecx
-; I386-NEXT:    movsbl %al, %edx
-; I386-NEXT:    cmpl $0, (%ecx,%edx,4)
+; I386-NEXT:    movsbl %al, %ecx
+; I386-NEXT:    movl mas_data_end___trans_tmp_2, %edx
+; I386-NEXT:    cmpl $0, (%edx,%ecx,4)
 ; I386-NEXT:    je .LBB0_5
 ; I386-NEXT:  # %bb.4:
 ; I386-NEXT:    movb %al, %ah
diff --git a/llvm/test/CodeGen/X86/zext-fold.ll b/llvm/test/CodeGen/X86/zext-fold.ll
index ad1ce14ad8435..5eb7fc2ca65eb 100644
--- a/llvm/test/CodeGen/X86/zext-fold.ll
+++ b/llvm/test/CodeGen/X86/zext-fold.ll
@@ -19,8 +19,8 @@ define i32 @test2(i8 %x) nounwind readnone {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andl $-32, %ecx
-; CHECK-NEXT:    orl $63, %eax
+; CHECK-NEXT:    orl $63, %ecx
+; CHECK-NEXT:    andl $-32, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    retl
   %A = and i8 %x, -32
@@ -37,9 +37,8 @@ declare void @use(i32, i8)
 define void @test3(i8 %x) nounwind readnone {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    subl $20, %esp
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    subl $8, %esp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-32, %eax
 ; CHECK-NEXT:    pushl %eax
diff --git a/llvm/test/CodeGen/X86/zext-lshr.ll b/llvm/test/CodeGen/X86/zext-lshr.ll
index 76f874663ed53..bc99f4548b512 100644
--- a/llvm/test/CodeGen/X86/zext-lshr.ll
+++ b/llvm/test/CodeGen/X86/zext-lshr.ll
@@ -91,9 +91,9 @@ define i64 @i64_zext_shift_i32_zext_i16(i16 %a0) nounwind {
 define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
 ; X86-LABEL: i128_zext_shift_i64_zext_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
@@ -115,9 +115,9 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
 define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind {
 ; X86-LABEL: i128_zext_shift_i64_zext_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shrl $7, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll
index bc0981781df8f..26c7c09132c88 100644
--- a/llvm/test/CodeGen/X86/zext-shl.ll
+++ b/llvm/test/CodeGen/X86/zext-shl.ll
@@ -100,9 +100,9 @@ define i64 @i64_zext_shift_i32_zext_i16(i16 %a0) nounwind {
 define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
 ; X86-LABEL: i128_zext_shift_i64_zext_i8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
@@ -124,9 +124,9 @@ define i128 @i128_zext_shift_i64_zext_i8(i8 %a0) nounwind {
 define i128 @i128_zext_shift_i64_zext_i16(i16 %a0) nounwind {
 ; X86-LABEL: i128_zext_shift_i64_zext_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $7, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)


